resctrl_arch_rmid_read() could be called by resctrl in process context,
and then called by the PMU driver from irq context on the same CPU.
This could cause struct arch_mbm_state's prev_msr value to go backwards,
leading to the chunks value being incremented multiple times.
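For example, with the current code (illustrative interleaving, A and B being
successive counter values, A <= B):

  process context                      irq context
  __rmid_read() -> A
                                       __rmid_read() -> B
                                       chunks += B - prev_msr
                                       prev_msr = B
  chunks += mbm_overflow_count(B, A)   (A < B looks like a hardware counter
                                        wrap, adding a bogus value)
  prev_msr = A                         (prev_msr has gone backwards)

The next read then counts the A->B range a second time.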
The struct arch_mbm_state holds both the previous msr value and a count
of the number of chunks. These two fields need to be updated atomically.
Read prev_msr before accessing the hardware, and cmpxchg() the new value
back afterwards. If prev_msr has changed in the meantime, the whole
sequence is re-attempted.
Signed-off-by: James Morse <[email protected]>
---
arch/x86/kernel/cpu/resctrl/internal.h | 5 +++--
arch/x86/kernel/cpu/resctrl/monitor.c | 28 +++++++++++++++++++-------
2 files changed, 24 insertions(+), 9 deletions(-)
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 6f18cf26988c..7960366b9434 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -2,6 +2,7 @@
#ifndef _ASM_X86_RESCTRL_INTERNAL_H
#define _ASM_X86_RESCTRL_INTERNAL_H
+#include <linux/atomic.h>
#include <linux/resctrl.h>
#include <linux/sched.h>
#include <linux/kernfs.h>
@@ -338,8 +339,8 @@ struct mbm_state {
* find this struct.
*/
struct arch_mbm_state {
- u64 chunks;
- u64 prev_msr;
+ atomic64_t chunks;
+ atomic64_t prev_msr;
};
/**
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index e267869d60d5..1f470e55d555 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -225,13 +225,15 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
{
struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
struct arch_mbm_state *am;
+ u64 msr_val;
am = get_arch_mbm_state(hw_dom, rmid, eventid);
if (am) {
memset(am, 0, sizeof(*am));
/* Record any initial, non-zero count value. */
- __rmid_read(rmid, eventid, &am->prev_msr);
+ __rmid_read(rmid, eventid, &msr_val);
+ atomic64_set(&am->prev_msr, msr_val);
}
}
@@ -266,23 +268,35 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+ u64 start_msr_val, old_msr_val, msr_val, chunks;
struct arch_mbm_state *am;
- u64 msr_val, chunks;
- int ret;
+ int ret = 0;
if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
return -EINVAL;
+interrupted:
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am)
+ start_msr_val = atomic64_read(&am->prev_msr);
+
ret = __rmid_read(rmid, eventid, &msr_val);
if (ret)
return ret;
am = get_arch_mbm_state(hw_dom, rmid, eventid);
if (am) {
- am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
- hw_res->mbm_width);
- chunks = get_corrected_mbm_count(rmid, am->chunks);
- am->prev_msr = msr_val;
+ old_msr_val = atomic64_cmpxchg(&am->prev_msr, start_msr_val,
+ msr_val);
+ if (old_msr_val != start_msr_val)
+ goto interrupted;
+
+ chunks = mbm_overflow_count(start_msr_val, msr_val,
+ hw_res->mbm_width);
+ atomic64_add(chunks, &am->chunks);
+
+ chunks = get_corrected_mbm_count(rmid,
+ atomic64_read(&am->chunks));
} else {
chunks = msr_val;
}
--
2.39.2
Hi James,
On Thu, May 25, 2023 at 8:03 PM James Morse <[email protected]> wrote:
> +interrupted:
> + am = get_arch_mbm_state(hw_dom, rmid, eventid);
> + if (am)
> + start_msr_val = atomic64_read(&am->prev_msr);
> +
> ret = __rmid_read(rmid, eventid, &msr_val);
> if (ret)
> return ret;
>
> am = get_arch_mbm_state(hw_dom, rmid, eventid);
> if (am) {
> - am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
> - hw_res->mbm_width);
> - chunks = get_corrected_mbm_count(rmid, am->chunks);
> - am->prev_msr = msr_val;
> + old_msr_val = atomic64_cmpxchg(&am->prev_msr, start_msr_val,
> + msr_val);
> + if (old_msr_val != start_msr_val)
> + goto interrupted;
> +
> + chunks = mbm_overflow_count(start_msr_val, msr_val,
> + hw_res->mbm_width);
> + atomic64_add(chunks, &am->chunks);
> +
> + chunks = get_corrected_mbm_count(rmid,
> + atomic64_read(&am->chunks));
> } else {
> chunks = msr_val;
> }
It looks like if __rmid_read() is interrupted by an occupancy counter
read between writing QM_EVTSEL and reading QM_CTR, it will not perform
any update to am->prev_msr, and the interrupted read will return the
same counter value as in the interrupting read.
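Something like this (sketch), where the interrupting read is an occupancy
event, so there is no arch_mbm_state for the cmpxchg() to trip over:

  process context (MBM read)           irq context (occupancy read)
  start_msr_val = prev_msr
  wrmsr(QM_EVTSEL, MBM event)
                                       wrmsr(QM_EVTSEL, occupancy event)
                                       rdmsrl(QM_CTR)  -> occupancy count
                                       (am == NULL, prev_msr untouched)
  rdmsrl(QM_CTR)                       -> also the occupancy count
  cmpxchg() succeeds, so the bogus value is accumulated and returned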
Maybe you could maintain something that's updated unconditionally, so an
interrupted reader can detect this?
-Peter
Hi Peter,
On 06/06/2023 09:49, Peter Newman wrote:
> On Thu, May 25, 2023 at 8:03 PM James Morse <[email protected]> wrote:
>> +interrupted:
>> + am = get_arch_mbm_state(hw_dom, rmid, eventid);
>> + if (am)
>> + start_msr_val = atomic64_read(&am->prev_msr);
>> +
>> ret = __rmid_read(rmid, eventid, &msr_val);
>> if (ret)
>> return ret;
>>
>> am = get_arch_mbm_state(hw_dom, rmid, eventid);
>> if (am) {
>> - am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
>> - hw_res->mbm_width);
>> - chunks = get_corrected_mbm_count(rmid, am->chunks);
>> - am->prev_msr = msr_val;
>> + old_msr_val = atomic64_cmpxchg(&am->prev_msr, start_msr_val,
>> + msr_val);
>> + if (old_msr_val != start_msr_val)
>> + goto interrupted;
>> +
>> + chunks = mbm_overflow_count(start_msr_val, msr_val,
>> + hw_res->mbm_width);
>> + atomic64_add(chunks, &am->chunks);
>> +
>> + chunks = get_corrected_mbm_count(rmid,
>> + atomic64_read(&am->chunks));
>> } else {
>> chunks = msr_val;
>> }
>
> It looks like if __rmid_read() is interrupted by an occupancy counter
> read between writing QM_EVTSEL and reading QM_CTR, it will not perform
> any update to am->prev_msr, and the interrupted read will return the
> same counter value as in the interrupting read.
Yup, that's a problem. I was only looking at the mbm state in memory, not the CPU register.
I think the fix is to read back QM_EVTSEL after reading QM_CTR. I'll do this in
__rmid_read() to avoid returning -EINTR. It creates two retry loops which is annoying, but
making the window larger means you're more likely to see false positives.
----------------------------%<----------------------------
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index e24390d2e661..aeba035bb680 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -101,6 +101,7 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
{
+ u32 _rmid, _eventid;
u64 msr_val;
/*
@@ -110,9 +111,15 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
* IA32_QM_CTR.data (bits 61:0) reports the monitored data.
* IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
* are error bits.
+ * QM_EVTSEL is re-read to detect if this function was interrupted by
+ * another call, meaning the QM_CTR value may belong to a different
+ * event.
*/
- wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
- rdmsrl(MSR_IA32_QM_CTR, msr_val);
+ do {
+ wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+ rdmsrl(MSR_IA32_QM_CTR, msr_val);
+ rdmsr(MSR_IA32_QM_EVTSEL, _eventid, _rmid);
+ } while (eventid != _eventid || rmid != _rmid);
if (msr_val & RMID_VAL_ERROR)
return -EIO;
----------------------------%<----------------------------
Thanks!
James
Hi James,
On Tue, Jun 6, 2023 at 7:03 PM James Morse <[email protected]> wrote:
> On 06/06/2023 09:49, Peter Newman wrote:
> > It looks like if __rmid_read() is interrupted by an occupancy counter
> > read between writing QM_EVTSEL and reading QM_CTR, it will not perform
> > any update to am->prev_msr, and the interrupted read will return the
> > same counter value as in the interrupting read.
>
> Yup, that's a problem. I was only looking at the mbm state in memory, not the CPU register.
> I think the fix is to read back QM_EVTSEL after reading QM_CTR. I'll do this in
> __rmid_read() to avoid returning -EINTR. It creates two retry loops which is annoying, but
> making the window larger means you're more likely to see false positives.
>
> ----------------------------%<----------------------------
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
> index e24390d2e661..aeba035bb680 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -101,6 +101,7 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
>
> static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
> {
> + u32 _rmid, _eventid;
> u64 msr_val;
>
> /*
> @@ -110,9 +111,15 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
> * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
> * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
> * are error bits.
> + * QM_EVTSEL is re-read to detect if this function was interrupted by
> + * another call, meaning the QM_CTR value may belong to a different
> + * event.
> */
> - wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
> - rdmsrl(MSR_IA32_QM_CTR, msr_val);
> + do {
> + wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
> + rdmsrl(MSR_IA32_QM_CTR, msr_val);
> + rdmsr(MSR_IA32_QM_EVTSEL, _eventid, _rmid);
> + } while (eventid != _eventid || rmid != _rmid);
>
> if (msr_val & RMID_VAL_ERROR)
> return -EIO;
I happen to be tracking the cost of resctrl_arch_rmid_read() calls, so
I measured the impact of your fix on my AMD EPYC 7B12:
with both this and the soft RMID series[1] applied:

  Base switch          7955   0.23%
  Hard RMID Switch     8476   6.80%
  Soft RMID Switch    10173  28.17%
  CLOSID+Soft RMID    10740  35.32%

then with the EVTSEL read-back patch added:

  Base switch          7985
  Hard RMID Switch     8540   6.96%
  Soft RMID Switch    11015  37.95%
  CLOSID+Soft RMID    11590  45.16%
The Soft RMID switches contain two __rmid_read() calls, so this
implies each QM_EVTSEL read-back is around 420 cycles on this AMD
implementation.
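(i.e. (11015 - 10173) / 2 ≈ 420 cycles, and similarly (11590 - 10740) / 2 ≈ 425
for the CLOSID+Soft RMID case.)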
Even if you don't agree with my plan to add resctrl_arch_rmid_read()
calls to context switches, there should be cheaper ways to handle
this.
-Peter
[1] https://lore.kernel.org/lkml/[email protected]/
Hi James,
On Thu, May 25, 2023 at 8:03 PM James Morse <[email protected]> wrote:
>
> resctrl_arch_rmid_read() could be called by resctrl in process context,
> and then called by the PMU driver from irq context on the same CPU.
Will there be x86 PMU changes to do this or is this only ARM? I just
want to make sure the x86 resctrl_arch_rmid_read() changes are
actually needed.
Thanks!
-Peter
Hi Peter,
On 6/8/23 09:53, Peter Newman wrote:
> On Thu, May 25, 2023 at 8:03 PM James Morse <[email protected]> wrote:
>>
>> resctrl_arch_rmid_read() could be called by resctrl in process context,
>> and then called by the PMU driver from irq context on the same CPU.
>
> Will there be x86 PMU changes to do this or is this only ARM? I just
> want to make sure the x86 resctrl_arch_rmid_read() changes are
> actually needed.
I plan to add that as an uncore 'resctrl_pmu' that works for all architectures by calling resctrl_arch_rmid_read(). Having that be specific to arm64 isn't helpful to user-space.
Perf is the natural home for these counters, and it will make it easier to add per-architecture
(or per-soc) counters, without having to teach resctrl about them.
This patch moved to 'this' side of the 'move to /fs/resctrl' because the problem can happen
without the PMU changes. If a NOHZ_FULL CPU makes a syscall to read a counter, that happens
in process context; if a CPU in another domain wants to read the same counter, it has to send
an IPI, which might target the same CPU.
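Something like this (rough sketch of the interleaving I have in mind):

  CPU0 (nohz_full, process context)    CPU1 (another domain)
  syscall reads a counter from
  CPU0's domain:
    resctrl_arch_rmid_read()           wants the same counter, sends an IPI
      __rmid_read()                    to a CPU in CPU0's domain
        wrmsr(QM_EVTSEL, ...)
  <IPI lands on CPU0>
      resctrl_arch_rmid_read()         (re-entered from irq context)
  </IPI>
        rdmsrl(QM_CTR)                 (value may belong to the other event)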
(I've not investigated whether this is an existing bug)
Thanks,
James
Hi Peter,
On 6/7/23 13:51, Peter Newman wrote:
> On Tue, Jun 6, 2023 at 7:03 PM James Morse <[email protected]> wrote:
>> On 06/06/2023 09:49, Peter Newman wrote:
>>> It looks like if __rmid_read() is interrupted by an occupancy counter
>>> read between writing QM_EVTSEL and reading QM_CTR, it will not perform
>>> any update to am->prev_msr, and the interrupted read will return the
>>> same counter value as in the interrupting read.
>>
>> Yup, that's a problem. I was only looking at the mbm state in memory, not the CPU register.
>> I think the fix is to read back QM_EVTSEL after reading QM_CTR. I'll do this in
>> __rmid_read() to avoid returning -EINTR. It creates two retry loops which is annoying, but
>> making the window larger means you're more likely to see false positives.
>>
>> ----------------------------%<----------------------------
>> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
>> index e24390d2e661..aeba035bb680 100644
>> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
>> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
>> @@ -101,6 +101,7 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
>>
>> static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
>> {
>> + u32 _rmid, _eventid;
>> u64 msr_val;
>>
>> /*
>> @@ -110,9 +111,15 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
>> * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
>> * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
>> * are error bits.
>> + * QM_EVTSEL is re-read to detect if this function was interrupted by
>> + * another call, meaning the QM_CTR value may belong to a different
>> + * event.
>> */
>> - wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
>> - rdmsrl(MSR_IA32_QM_CTR, msr_val);
>> + do {
>> + wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
>> + rdmsrl(MSR_IA32_QM_CTR, msr_val);
>> + rdmsr(MSR_IA32_QM_EVTSEL, _eventid, _rmid);
>> + } while (eventid != _eventid || rmid != _rmid);
>>
>> if (msr_val & RMID_VAL_ERROR)
>> return -EIO;
> I happen to be tracking the cost of resctrl_arch_rmid_read() calls, so
> I measured the impact of your fix on my AMD EPYC 7B12:
>
> with both this and the soft RMID series[1] applied:
> The Soft RMID switches contain two __rmid_read() calls, so this
> implies each QM_EVTSEL read-back is around 420 cycles on this AMD
> implementation.
Oooer. I assumed writes might have tedious side-effects but reads would be cheap.
I suppose it's because another CPU may have modified this value in the meantime.
> Even if you don't agree with my plan to add resctrl_arch_rmid_read()
> calls to context switches, there should be cheaper ways to handle
> this.
Yup, I've swapped this for a sequence counter[0], which should push that cost into the noise.
Anything left will be the cost of the atomics.
Thanks,
James
[0] barely tested:
------------------%<------------------
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 238831d53479..86d3a1b99be6 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -16,6 +16,7 @@
*/
#include <linux/module.h>
+#include <linux/percpu.h>
#include <linux/sizes.h>
#include <linux/slab.h>
@@ -24,6 +25,9 @@
#include "internal.h"
+/* Sequence number for writes to IA32 QM_EVTSEL */
+static DEFINE_PER_CPU(u64, qm_evtsel_seq);
+
struct rmid_entry {
/*
* Some architectures's resctrl_arch_rmid_read() needs the CLOSID value
@@ -178,8 +182,7 @@ static inline struct rmid_entry *__rmid_entry(u32 idx)
static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
{
- u32 _rmid, _eventid;
- u64 msr_val;
+ u64 msr_val, seq;
/*
* As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
@@ -188,15 +191,16 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
* IA32_QM_CTR.data (bits 61:0) reports the monitored data.
* IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
* are error bits.
- * QM_EVTSEL is re-read to detect if this function was interrupted by
- * another call, meaning the QM_CTR value may belong to a different
- * event.
+ * A per-cpu sequence counter is incremented each time QM_EVTSEL is
+ * written. This is used to detect if this function was interrupted by
+ * another call without re-reading the MSRs. Retry the MSR read when
+ * this happens as the QM_CTR value may belong to a different event.
*/
do {
+ seq = this_cpu_inc_return(qm_evtsel_seq);
wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
rdmsrl(MSR_IA32_QM_CTR, msr_val);
- rdmsr(MSR_IA32_QM_EVTSEL, _eventid, _rmid);
- } while (eventid != _eventid || rmid != _rmid);
+ } while (seq != this_cpu_read(qm_evtsel_seq));
if (msr_val & RMID_VAL_ERROR)
return -EIO;
------------------%<------------------