2024-02-06 19:01:49

by Marcelo Tosatti

[permalink] [raw]
Subject: [patch 05/12] timekeeping_notify: use stop_machine_fail when appropriate

Change timekeeping_notify to use stop_machine_fail when appropriate,
which will fail in case the target CPU is tagged as block interference CPU.

Signed-off-by: Marcelo Tosatti <[email protected]>

Index: linux-isolation/include/linux/clocksource.h
===================================================================
--- linux-isolation.orig/include/linux/clocksource.h
+++ linux-isolation/include/linux/clocksource.h
@@ -267,7 +267,7 @@ extern void clocksource_arch_init(struct
static inline void clocksource_arch_init(struct clocksource *cs) { }
#endif

-extern int timekeeping_notify(struct clocksource *clock);
+extern int timekeeping_notify(struct clocksource *clock, bool fail);

extern u64 clocksource_mmio_readl_up(struct clocksource *);
extern u64 clocksource_mmio_readl_down(struct clocksource *);
Index: linux-isolation/kernel/time/clocksource.c
===================================================================
--- linux-isolation.orig/kernel/time/clocksource.c
+++ linux-isolation/kernel/time/clocksource.c
@@ -125,7 +125,7 @@ static u64 suspend_start;

#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
static void clocksource_watchdog_work(struct work_struct *work);
-static void clocksource_select(void);
+static int clocksource_select(bool fail);

static LIST_HEAD(watchdog_list);
static struct clocksource *watchdog;
@@ -679,7 +679,7 @@ static int clocksource_watchdog_kthread(
{
mutex_lock(&clocksource_mutex);
if (__clocksource_watchdog_kthread())
- clocksource_select();
+ clocksource_select(false);
mutex_unlock(&clocksource_mutex);
return 0;
}
@@ -976,7 +976,7 @@ static struct clocksource *clocksource_f
return NULL;
}

-static void __clocksource_select(bool skipcur)
+static int __clocksource_select(bool skipcur, bool fail)
{
bool oneshot = tick_oneshot_mode_active();
struct clocksource *best, *cs;
@@ -984,7 +984,7 @@ static void __clocksource_select(bool sk
/* Find the best suitable clocksource */
best = clocksource_find_best(oneshot, skipcur);
if (!best)
- return;
+ return 0;

if (!strlen(override_name))
goto found;
@@ -1021,10 +1021,16 @@ static void __clocksource_select(bool sk
}

found:
- if (curr_clocksource != best && !timekeeping_notify(best)) {
+ if (curr_clocksource != best) {
+ int ret;
+
+ ret = timekeeping_notify(best, fail);
+ if (ret)
+ return ret;
pr_info("Switched to clocksource %s\n", best->name);
curr_clocksource = best;
}
+ return 0;
}

/**
@@ -1035,14 +1041,14 @@ found:
* Select the clocksource with the best rating, or the clocksource,
* which is selected by userspace override.
*/
-static void clocksource_select(void)
+static int clocksource_select(bool fail)
{
- __clocksource_select(false);
+ return __clocksource_select(false, fail);
}

-static void clocksource_select_fallback(void)
+static int clocksource_select_fallback(void)
{
- __clocksource_select(true);
+ return __clocksource_select(true, true);
}

/*
@@ -1061,7 +1067,7 @@ static int __init clocksource_done_booti
* Run the watchdog first to eliminate unstable clock sources
*/
__clocksource_watchdog_kthread();
- clocksource_select();
+ clocksource_select(false);
mutex_unlock(&clocksource_mutex);
return 0;
}
@@ -1209,7 +1215,7 @@ int __clocksource_register_scale(struct
clocksource_enqueue_watchdog(cs);
clocksource_watchdog_unlock(&flags);

- clocksource_select();
+ clocksource_select(false);
clocksource_select_watchdog(false);
__clocksource_suspend_select(cs);
mutex_unlock(&clocksource_mutex);
@@ -1238,7 +1244,7 @@ void clocksource_change_rating(struct cl
__clocksource_change_rating(cs, rating);
clocksource_watchdog_unlock(&flags);

- clocksource_select();
+ clocksource_select(false);
clocksource_select_watchdog(false);
clocksource_suspend_select(false);
mutex_unlock(&clocksource_mutex);
@@ -1260,8 +1266,12 @@ static int clocksource_unbind(struct clo
}

if (cs == curr_clocksource) {
+ int ret;
+
/* Select and try to install a replacement clock source */
- clocksource_select_fallback();
+ ret = clocksource_select_fallback();
+ if (ret)
+ return ret;
if (curr_clocksource == cs)
return -EBUSY;
}
@@ -1352,17 +1362,17 @@ static ssize_t current_clocksource_store
struct device_attribute *attr,
const char *buf, size_t count)
{
- ssize_t ret;
+ ssize_t ret, err;

mutex_lock(&clocksource_mutex);

ret = sysfs_get_uname(buf, override_name, count);
if (ret >= 0)
- clocksource_select();
+ err = clocksource_select(true);

mutex_unlock(&clocksource_mutex);

- return ret;
+ return err ? err : ret;
}
static DEVICE_ATTR_RW(current_clocksource);

Index: linux-isolation/kernel/time/timekeeping.c
===================================================================
--- linux-isolation.orig/kernel/time/timekeeping.c
+++ linux-isolation/kernel/time/timekeeping.c
@@ -13,6 +13,7 @@
#include <linux/sched.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/clock.h>
+#include <linux/sched/isolation.h>
#include <linux/syscore_ops.h>
#include <linux/clocksource.h>
#include <linux/jiffies.h>
@@ -1497,13 +1498,24 @@ static int change_clocksource(void *data
* This function is called from clocksource.c after a new, better clock
* source has been registered. The caller holds the clocksource_mutex.
*/
-int timekeeping_notify(struct clocksource *clock)
+int timekeeping_notify(struct clocksource *clock, bool fail)
{
struct timekeeper *tk = &tk_core.timekeeper;

if (tk->tkr_mono.clock == clock)
return 0;
- stop_machine(change_clocksource, clock, NULL);
+
+ if (!fail)
+ stop_machine(change_clocksource, clock, NULL);
+ else {
+ int ret, idx;
+
+ idx = block_interf_srcu_read_lock();
+ ret = stop_machine_fail(change_clocksource, clock, NULL);
+ block_interf_srcu_read_unlock(idx);
+ if (ret)
+ return ret;
+ }
tick_clock_notify();
return tk->tkr_mono.clock == clock ? 0 : -1;
}




2024-02-07 11:57:31

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch 05/12] timekeeping_notify: use stop_machine_fail when appropriate

On Tue, Feb 06 2024 at 15:49, Marcelo Tosatti wrote:
> Change timekeeping_notify to use stop_machine_fail when appropriate,
> which will fail in case the target CPU is tagged as block interference
> CPU.

You completely fail to explain 'appropriate'. There is zero reason for
this churn, really.


2024-02-07 13:25:02

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [patch 05/12] timekeeping_notify: use stop_machine_fail when appropriate

On Wed, Feb 07, 2024 at 12:57:19PM +0100, Thomas Gleixner wrote:
> On Tue, Feb 06 2024 at 15:49, Marcelo Tosatti wrote:
> > Change timekeeping_notify to use stop_machine_fail when appropriate,
> > which will fail in case the target CPU is tagged as block interference
> > CPU.
>
> You completely fail to explain 'appropriate'. There is zero reason for
> this churn, really.

The churn is so that we can return an error to
current_clocksource_store (sysfs handler for writes to
/sys/devices/system/clocksource/clocksource0/current_clocksource).

Yeah, should probably separate patches that prepare code for
returning errors from the actual change between smp_call_function_single
and smp_call_function_single_fail (started that for the
rdmsr_on_cpu/wrmsr_on_cpu helpers).

OK, so I am assuming you are good with the general idea of the patch
(and have no better one) which is:

idx = block_interf_srcu_read_lock();
ret = smp_call_function_single_fail(cpu, remote_fn, ...); (or stop_machine_fail)
block_interf_srcu_read_unlock(idx);





2024-02-08 15:24:20

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch 05/12] timekeeping_notify: use stop_machine_fail when appropriate

On Wed, Feb 07 2024 at 09:58, Marcelo Tosatti wrote:
> On Wed, Feb 07, 2024 at 12:57:19PM +0100, Thomas Gleixner wrote:
>> On Tue, Feb 06 2024 at 15:49, Marcelo Tosatti wrote:
>> > Change timekeeping_notify to use stop_machine_fail when appropriate,
>> > which will fail in case the target CPU is tagged as block interference
>> > CPU.
>>
>> You completely fail to explain 'appropriate'. There is zero reason for
>> this churn, really.
>
> The churn is so that we can return an error to
> current_clocksource_store (sysfs handler for writes to
> /sys/devices/system/clocksource/clocksource0/current_clocksource).

What for? Why?

Writing to that file requires root. Root can rightfully screw up a
system and adding a debugfs based "prevention" mechanism is not making
this any better because root can just clear the CPU mask there and move
on.

So what is the actual real world problem solved by these patches?

All I've seen so far is handwaving about interference prevention and TBH
I can't squint hard enough to believe that.

Thanks,

tglx

2024-02-09 16:13:49

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [patch 05/12] timekeeping_notify: use stop_machine_fail when appropriate

On Thu, Feb 08, 2024 at 04:23:58PM +0100, Thomas Gleixner wrote:
> On Wed, Feb 07 2024 at 09:58, Marcelo Tosatti wrote:
> > On Wed, Feb 07, 2024 at 12:57:19PM +0100, Thomas Gleixner wrote:
> >> On Tue, Feb 06 2024 at 15:49, Marcelo Tosatti wrote:
> >> > Change timekeeping_notify to use stop_machine_fail when appropriate,
> >> > which will fail in case the target CPU is tagged as block interference
> >> > CPU.
> >>
> >> You completely fail to explain 'appropriate'. There is zero reason for
> >> this churn, really.
> >
> > The churn is so that we can return an error to
> > current_clocksource_store (sysfs handler for writes to
> > /sys/devices/system/clocksource/clocksource0/current_clocksource).
>
> What for? Why?
>
> Writing to that file requires root. Root can rightfully screw up a
> system and adding a debugfs based "prevention" mechanism is not making
> this any better because root can just clear the CPU mask there and move
> on.
>
> So what is the actual real world problem solved by these patches?
>
> All I've seen so far is handwaving about interference prevention and TBH
> I can't squint hard enough to believe that.
>
> Thanks,
>
> tglx


Thomas,

The problem is an administrator is not aware of the relationship between

Kernel interface -> generation of IPIs.


Even less so of which applications in the system are accessing
which kernel interfaces.

Let me give some examples:

1) Change of trip temperatures from userspace.

static int
sys_set_trip_temp(struct thermal_zone_device *tzd, int trip, int temp)
{
struct zone_device *zonedev = thermal_zone_device_priv(tzd);
u32 l, h, mask, shift, intr;
int tj_max, val, ret;

tj_max = intel_tcc_get_tjmax(zonedev->cpu);
if (tj_max < 0)
return tj_max;
tj_max *= 1000;

val = (tj_max - temp)/1000;

if (trip >= MAX_NUMBER_OF_TRIPS || val < 0 || val > 0x7f)
return -EINVAL;

ret = rdmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
&l, &h);
if (ret < 0)
return ret;

if (trip) {
mask = THERM_MASK_THRESHOLD1;
shift = THERM_SHIFT_THRESHOLD1;
intr = THERM_INT_THRESHOLD1_ENABLE;
} else {
mask = THERM_MASK_THRESHOLD0;
shift = THERM_SHIFT_THRESHOLD0;
intr = THERM_INT_THRESHOLD0_ENABLE;
}
l &= ~mask;
/*
* When users space sets a trip temperature == 0, which is indication
* that, it is no longer interested in receiving notifications.
*/
if (!temp) {
l &= ~intr;
} else {
l |= val << shift;
l |= intr;
}

return wrmsr_on_cpu(zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT,
l, h);
}

It seems plausible that a userspace application decides to change the trip temperature.

2) Read of per-CPU registers (from sysfs).

arch/arm64/kernel/topology.c

static inline
int counters_read_on_cpu(int cpu, smp_call_func_t func, u64 *val)
{
/*
* Abort call on counterless CPU or when interrupts are
* disabled - can lead to deadlock in smp sync call.
*/
if (!cpu_has_amu_feat(cpu))
return -EOPNOTSUPP;

if (WARN_ON_ONCE(irqs_disabled()))
return -EPERM;

smp_call_function_single(cpu, func, val, 1);

return 0;
}

sysfs read -> cppc_get_perf_caps -> cpc_read -> cpc_read_ffh -> counters_read_on_cpu

#define define_one_cppc_ro(_name) \
static struct kobj_attribute _name = \
__ATTR(_name, 0444, show_##_name, NULL)

This one does not even require root.

Other interfaces on the same class:

show_pw20_wait_time (PowerPC)
uncore_read_freq (x86)
..

So while I understand your point that root can (and should be)
able to perform any operations on the system, this interface would
be along the lines of:

"I don't want isolated CPUs to be interrupted, but I am not aware of
which kernel interfaces can result in interruptions to isolated CPUs.

Let's indicate through this cpumask, which the kernel can consult,
that we'd like userspace operations to fail, if they were going
to interrupt an isolated CPU".

It's the kernel that knows which operations might interrupt
isolated CPUs, not userspace.

Also https://www.spinics.net/lists/kernel/msg5094328.html.


2024-02-12 15:29:25

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch 05/12] timekeeping_notify: use stop_machine_fail when appropriate

On Fri, Feb 09 2024 at 12:30, Marcelo Tosatti wrote:
> On Thu, Feb 08, 2024 at 04:23:58PM +0100, Thomas Gleixner wrote:
> So while I understand your point that root can (and should be)
> able to perform any operations on the system, this interface would
> be along the lines of:
>
> "I don't want isolated CPUs to be interrupted, but I am not aware of
> which kernel interfaces can result in interruptions to isolated CPUs.
>
> Let's indicate through this cpumask, which the kernel can consult,
> that we'd like userspace operations to fail, if they were going
> to interrupt an isolated CPU".
>
> It's the kernel that knows which operations might interrupt
> isolated CPUs, not userspace.

Right, but you cannot just throw a CPU mask in and decide that it will
work for everything.

As I explained to you before, these interfaces cannot be treated in the
same way just because they might end up in a SMP function call.

You are defining a binary all-or-nothing policy which is the worst
thing you can do, because it prevents any fine grained decision and in
case an interface is needed for a particular operation it requires to
open up the gates for everything. "Works for me" and "scratches my itch" are
not really valid engineering principles.

Not to talk about the blind replacement of the SMP function call which
causes inconsistent state as I pointed out to you.

Thanks

tglx