2022-04-19 16:45:42

by Yazen Ghannam

[permalink] [raw]
Subject: [PATCH 1/3] x86/MCE, EDAC/mce_amd: Add support for new MCA_SYND{1,2} registers

Future Scalable MCA systems will include two new registers: MCA_SYND1
and MCA_SYND2.

These registers will include supplemental error information in addition
to the existing MCA_SYND register. The data within the registers is
considered valid if MCA_STATUS[SyndV] is set.

Add fields for these registers in struct mce. Save and print these
registers wherever MCA_STATUS[SyndV]/MCA_SYND is currently used.

Note: Checkpatch warnings/errors are ignored to maintain coding style.

Signed-off-by: Yazen Ghannam <[email protected]>
---
arch/x86/include/asm/mce.h | 5 +++++
arch/x86/include/uapi/asm/mce.h | 2 ++
arch/x86/kernel/cpu/mce/amd.c | 5 ++++-
arch/x86/kernel/cpu/mce/core.c | 9 ++++++++-
drivers/edac/mce_amd.c | 7 +++++--
include/trace/events/mce.h | 7 ++++++-
6 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index cc73061e7255..25e3e2bc8c0a 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -116,6 +116,9 @@
#define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008
#define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009
#define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a
+/* Registers MISC2 to MISC4 are at offsets B to D. */
+#define MSR_AMD64_SMCA_MC0_SYND1 0xc000200e
+#define MSR_AMD64_SMCA_MC0_SYND2 0xc000200f
#define MSR_AMD64_SMCA_MCx_CTL(x) (MSR_AMD64_SMCA_MC0_CTL + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_STATUS(x) (MSR_AMD64_SMCA_MC0_STATUS + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_ADDR(x) (MSR_AMD64_SMCA_MC0_ADDR + 0x10*(x))
@@ -126,6 +129,8 @@
#define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x)))
+#define MSR_AMD64_SMCA_MCx_SYND1(x) (MSR_AMD64_SMCA_MC0_SYND1 + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_SYND2(x) (MSR_AMD64_SMCA_MC0_SYND2 + 0x10*(x))

#define XEC(x, mask) (((x) >> 16) & mask)

diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h
index db9adc081c5a..e77663a4abfa 100644
--- a/arch/x86/include/uapi/asm/mce.h
+++ b/arch/x86/include/uapi/asm/mce.h
@@ -36,6 +36,8 @@ struct mce {
__u64 ppin; /* Protected Processor Inventory Number */
__u32 microcode; /* Microcode revision */
__u64 kflags; /* Internal kernel use */
+ __u64 synd1; /* MCA_SYND1 MSR: only valid on SMCA systems */
+ __u64 synd2; /* MCA_SYND2 MSR: only valid on SMCA systems */
};

#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 1c87501e0fa3..23e34e5be7ed 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -750,8 +750,11 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
if (mce_flags.smca) {
rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid);

- if (m.status & MCI_STATUS_SYNDV)
+ if (m.status & MCI_STATUS_SYNDV) {
rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd);
+ rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(bank), m.synd1);
+ rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(bank), m.synd2);
+ }
}

mce_log(&m);
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index d775fcd74e98..28e7a3c9ecfe 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -190,6 +190,10 @@ static void __print_mce(struct mce *m)
if (mce_flags.smca) {
if (m->synd)
pr_cont("SYND %llx ", m->synd);
+ if (m->synd1)
+ pr_cont("SYND1 %llx ", m->synd1);
+ if (m->synd2)
+ pr_cont("SYND2 %llx ", m->synd2);
if (m->ipid)
pr_cont("IPID %llx ", m->ipid);
}
@@ -647,8 +651,11 @@ static noinstr void mce_read_aux(struct mce *m, int i)
if (mce_flags.smca) {
m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));

- if (m->status & MCI_STATUS_SYNDV)
+ if (m->status & MCI_STATUS_SYNDV) {
m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
+ m->synd1 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(i));
+ m->synd2 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(i));
+ }
}
}

diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index cc5c63feb26a..28b48c711fe0 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -1291,8 +1291,11 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
if (boot_cpu_has(X86_FEATURE_SMCA)) {
pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);

- if (m->status & MCI_STATUS_SYNDV)
- pr_cont(", Syndrome: 0x%016llx", m->synd);
+ if (m->status & MCI_STATUS_SYNDV) {
+ pr_cont(", Syndrome: 0x%016llx\n", m->synd);
+ pr_emerg(HW_ERR "Syndrome1: 0x%016llx, Syndrome2: 0x%016llx",
+ m->synd1, m->synd2);
+ }

pr_cont("\n");

diff --git a/include/trace/events/mce.h b/include/trace/events/mce.h
index 1391ada0da3b..a6826c34a185 100644
--- a/include/trace/events/mce.h
+++ b/include/trace/events/mce.h
@@ -22,6 +22,8 @@ TRACE_EVENT(mce_record,
__field( u64, addr )
__field( u64, misc )
__field( u64, synd )
+ __field( u64, synd1 )
+ __field( u64, synd2 )
__field( u64, ipid )
__field( u64, ip )
__field( u64, tsc )
@@ -42,6 +44,8 @@ TRACE_EVENT(mce_record,
__entry->addr = m->addr;
__entry->misc = m->misc;
__entry->synd = m->synd;
+ __entry->synd1 = m->synd1;
+ __entry->synd2 = m->synd2;
__entry->ipid = m->ipid;
__entry->ip = m->ip;
__entry->tsc = m->tsc;
@@ -55,12 +59,13 @@ TRACE_EVENT(mce_record,
__entry->cpuvendor = m->cpuvendor;
),

- TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR/MISC/SYND: %016Lx/%016Lx/%016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PROCESSOR: %u:%x, TIME: %llu, SOCKET: %u, APIC: %x",
+ TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, IPID: %016Lx, ADDR/MISC/SYND: %016Lx/%016Lx/%016Lx, SYND1/SYND2: %016Lx/%016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PROCESSOR: %u:%x, TIME: %llu, SOCKET: %u, APIC: %x",
__entry->cpu,
__entry->mcgcap, __entry->mcgstatus,
__entry->bank, __entry->status,
__entry->ipid,
__entry->addr, __entry->misc, __entry->synd,
+ __entry->synd1, __entry->synd2,
__entry->cs, __entry->ip,
__entry->tsc,
__entry->cpuvendor, __entry->cpuid,
--
2.25.1


2022-06-30 11:49:13

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86/MCE, EDAC/mce_amd: Add support for new MCA_SYND{1,2} registers

On Mon, Apr 18, 2022 at 05:44:38PM +0000, Yazen Ghannam wrote:
> Future Scalable MCA systems will include two new registers: MCA_SYND1
> and MCA_SYND2.
>
> These registers will include supplemental error information in addition
> to the existing MCA_SYND register. The data within the registers is
> considered valid if MCA_STATUS[SyndV] is set.
>
> Add fields for these registers in struct mce. Save and print these
> registers wherever MCA_STATUS[SyndV]/MCA_SYND is currently used.

That's all fine and good but what kind of supplemental error information
are we talking about here? Example?

And how is that error info going to be used in luserspace?

I don't want to increase struct mce record size by 16 bytes and those
end up unused.

Can the information from MCA_SYND{,1,2} be synthesized into a smaller
quantity an then fed to userspace?

Thx.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2022-07-11 17:42:47

by Yazen Ghannam

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86/MCE, EDAC/mce_amd: Add support for new MCA_SYND{1,2} registers

On Thu, Jun 30, 2022 at 01:01:58PM +0200, Borislav Petkov wrote:
> On Mon, Apr 18, 2022 at 05:44:38PM +0000, Yazen Ghannam wrote:
> > Future Scalable MCA systems will include two new registers: MCA_SYND1
> > and MCA_SYND2.
> >
> > These registers will include supplemental error information in addition
> > to the existing MCA_SYND register. The data within the registers is
> > considered valid if MCA_STATUS[SyndV] is set.
> >
> > Add fields for these registers in struct mce. Save and print these
> > registers wherever MCA_STATUS[SyndV]/MCA_SYND is currently used.
>
> That's all fine and good but what kind of supplemental error information
> are we talking about here? Example?
>
> And how is that error info going to be used in luserspace?
>

I think the general case will be more bank-specific information. For example,
if the bank is a cache type then the info one format and if the bank is a CPU
type then it'll be a different format, etc. So I think the new info will be
treated the same as the old info, i.e. collect all the raw data and share it
with a hardware debug person.

The one example where this is different is the "FRU Text" case covered in a
following patch in this set.

> I don't want to increase struct mce record size by 16 bytes and those
> end up unused.
>
> Can the information from MCA_SYND{,1,2} be synthesized into a smaller
> quantity an then fed to userspace?
>

I don't think so, at least not at the moment. There aren't any "architectural"
fields that can be interpreted the same accross multiple errors types and
banks.

Is your concern specifically on growing/changing struct mce, or is it more
about limiting info sent to userspace?

If it's the former, then I've been thinking it would be good to introduce a
new internal "struct mce_ext" that includes struct mce plus other things. This
way struct mce can still be uapi, and things like mcelog can use it. And at
the same time we can new data used in the kernel or shared through
tracepoints.

/* Extended MCE structure */
struct mce_ext {
struct mce *m;
/* new stuff here */
};

What do you think?

Thanks,
Yazen

2022-07-18 09:37:08

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86/MCE, EDAC/mce_amd: Add support for new MCA_SYND{1,2} registers

On Mon, Jul 11, 2022 at 05:31:41PM +0000, Yazen Ghannam wrote:
> Is your concern specifically on growing/changing struct mce, or is it more
> about limiting info sent to userspace?

Well, both, kinda.

> If it's the former, then I've been thinking it would be good to introduce a
> new internal "struct mce_ext" that includes struct mce plus other things. This
> way struct mce can still be uapi, and things like mcelog can use it. And at
> the same time we can new data used in the kernel or shared through
> tracepoints.
>
> /* Extended MCE structure */
> struct mce_ext {
> struct mce *m;
> /* new stuff here */
> };
>
> What do you think?

The moment you make it part of a trace record, it practically becomes
ABI.

So we could have some opaque blob which is called vendor-specific data
and which is dumped raw to userspace, without any specification to its
format so that luserspace doesn't get any ideas...

Lemme talk to rostedt.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2022-07-18 14:15:41

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86/MCE, EDAC/mce_amd: Add support for new MCA_SYND{1,2} registers

On Mon, Jul 18, 2022 at 10:57:19AM +0200, Borislav Petkov wrote:
> Lemme talk to rostedt.

Right, he says __dynamic_array(). Grep the tree for examples.

I'm thinking we can add a field called vendor_info or so, at the end of
the TP and then dump whatever fields we want there.

We can even slap a flag in front of the whole thing saying what the
vendor info type is. But we don't have to get ahead of ourselves and
keep it simple first.

How does that sound?

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2022-08-02 12:34:18

by Yazen Ghannam

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86/MCE, EDAC/mce_amd: Add support for new MCA_SYND{1,2} registers

On Mon, Jul 18, 2022 at 03:50:46PM +0200, Borislav Petkov wrote:
> On Mon, Jul 18, 2022 at 10:57:19AM +0200, Borislav Petkov wrote:
> > Lemme talk to rostedt.
>
> Right, he says __dynamic_array(). Grep the tree for examples.
>
> I'm thinking we can add a field called vendor_info or so, at the end of
> the TP and then dump whatever fields we want there.
>
> We can even slap a flag in front of the whole thing saying what the
> vendor info type is. But we don't have to get ahead of ourselves and
> keep it simple first.
>
> How does that sound?
>

Sounds okay to me. But how should this look in the internal kernel data
structures?

struct mce {
...
void *vendor_info; /* Points to a vendor-defined struct. */
};

..or..

struct mce_ext {
struct mce *m;
void *vendor_info;
};

I ask because struct mce is UAPI. But I think this is just for /dev/mcelog,
and this has been deprecated for a while. So on a related note, should
/dev/mcelog be removed and struct mce moved out of UAPI? Then changes to
struct mce won't affect user space, and we can just consider the mce trace
event when reporting to user space.

Thanks,
Yazen

2022-08-02 17:13:16

by Luck, Tony

[permalink] [raw]
Subject: RE: [PATCH 1/3] x86/MCE, EDAC/mce_amd: Add support for new MCA_SYND{1,2} registers

> I ask because struct mce is UAPI. But I think this is just for /dev/mcelog,
> and this has been deprecated for a while. So on a related note, should
> /dev/mcelog be removed and struct mce moved out of UAPI? Then changes to
> struct mce won't affect user space, and we can just consider the mce trace
> event when reporting to user space.

Even though deprecated, mcelog still has users, so I don't think it should go away
yet.

-Tony

2022-10-24 18:05:42

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86/MCE, EDAC/mce_amd: Add support for new MCA_SYND{1,2} registers

On Tue, Aug 02, 2022 at 12:22:20PM +0000, Yazen Ghannam wrote:
> I ask because struct mce is UAPI. But I think this is just for /dev/mcelog,
> and this has been deprecated for a while. So on a related note, should
> /dev/mcelog be removed and struct mce moved out of UAPI? Then changes to
> struct mce won't affect user space, and we can just consider the mce trace
> event when reporting to user space.

Question is, do you want those error records to be fed into mcelog on
AMD too?

And I remember you guys supporting it at some point.

The answer to that question will tell you how exactly to build your
structure of data you shuffle to luserspace.

I'd say.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2022-10-24 19:00:53

by Luck, Tony

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86/MCE, EDAC/mce_amd: Add support for new MCA_SYND{1,2} registers

On Mon, Oct 24, 2022 at 06:09:10PM +0200, Borislav Petkov wrote:
> On Tue, Aug 02, 2022 at 12:22:20PM +0000, Yazen Ghannam wrote:
> > I ask because struct mce is UAPI. But I think this is just for /dev/mcelog,
> > and this has been deprecated for a while. So on a related note, should
> > /dev/mcelog be removed and struct mce moved out of UAPI? Then changes to
> > struct mce won't affect user space, and we can just consider the mce trace
> > event when reporting to user space.
>
> Question is, do you want those error records to be fed into mcelog on
> AMD too?
>
> And I remember you guys supporting it at some point.
>
> The answer to that question will tell you how exactly to build your
> structure of data you shuffle to luserspace.

There are still a fair number of users of mcelog, so I think it needs
to remain in its half-undead state a while longer.

Changes to "struct mce" have always been supported. Several have
been made over the years. The rules are quite simple:

1) Do not remove any existing fields
2) Legacy fields that are no longer used should have value 0.
3) Kernel internal values (currently just "kflags") should be
zeroed in the structures passed out to user space.
3) New fields must be added at the end.

-Tony

2022-10-24 22:35:37

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86/MCE, EDAC/mce_amd: Add support for new MCA_SYND{1,2} registers

On Mon, Oct 24, 2022 at 09:38:44AM -0700, Tony Luck wrote:
> There are still a fair number of users of mcelog, so I think it needs
> to remain in its half-undead state a while longer.

That's not the question - the question is how should vendor-specific
info should be logged so that the struct mce record doesn't get blown
up or ends up containing unused fields on the other vendor.

I.e., how to keep it as small as possible and to share the space there
in the most compact way.

That vendor-specific "space" in there could be used by each vendor
differently. As in this case, Intel doesn't have MCA_SYND{1,2} u64
values. So they could be part of a vendor_info which gets interpreted
based on vendor.

When Intel wants to carry more info through struct mce to userspace,
it can reuse those 2 u64s which are vendor_info but interpret them
differently.

Which then begs the question, how should those get logged etc.

I guess a u8 vendor_info[VENDOR_INFO_SIZE] or so which we can extend
later if needed.

Perhaps prepend it with its length too:

error_record {
struct mce;
unsigned int vendor_info_len;
u8 vendor_info[vendor_info_len];
};

For example.

Not saying this is how it should be done - this is just what is swirling
around in my head right now.

Hmm.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2022-10-24 23:35:01

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86/MCE, EDAC/mce_amd: Add support for new MCA_SYND{1,2} registers

On Mon, Oct 24, 2022 at 09:08:54PM +0000, Luck, Tony wrote:
> We already have:
>
> __u8 cpuvendor; /* Kernel's X86_VENDOR enum */
>
> So the mce structure contains which vendor created it.
>
> > I guess a u8 vendor_info[VENDOR_INFO_SIZE] or so which we can extend
> > later if needed.
>
> Extending is hard because we already boxed in the two AMD specific fields
> with some generic fields that follow (ppin & microcode).
>
> But we could change the current form to be something like:
>
> union {
> struct vendor_info {
> __u64 vendor_info[2];
> };
> struct vendor_amd_info {
> __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */
> __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */
> };
> };
>
> to make it clear that these 16 bytes are up for grabs to be re-interpreted based on
> the value of "cpuvendor" (and possibly also "cpuid" if a vendor wants different data
> logged for different models).

Yes, there's that. But this thread contains a patch which wants to add
two more fields.

So the above you're proposing we could do if you wanna reuse that info
on Intel.

But again, there's this other question how to add a *new* vendor_info
at the *end* of the structure which can be shared too *and* which could
potentially be enlarged.

And, if we do

struct mce;
vendor_info;
<field shared by the two vendors>

then we're boxing in that vendor_info again and we cannot enlarge it
either.

That's why I'm proposing this prepended length in front of the
vendor_info field so that it can be extended properly in the future
and new shared members can also be added and this whole scheme can be
forward-compatible, so to speak, without having to cast anything in
stone.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2022-10-24 23:40:36

by Luck, Tony

[permalink] [raw]
Subject: RE: [PATCH 1/3] x86/MCE, EDAC/mce_amd: Add support for new MCA_SYND{1,2} registers

We already have:

__u8 cpuvendor; /* Kernel's X86_VENDOR enum */

So the mce structure contains which vendor created it.

> I guess a u8 vendor_info[VENDOR_INFO_SIZE] or so which we can extend
> later if needed.

Extending is hard because we already boxed in the two AMD specific fields
with some generic fields that follow (ppin & microcode).

But we could change the current form to be something like:

union {
struct vendor_info {
__u64 vendor_info[2];
};
struct vendor_amd_info {
__u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */
__u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */
};
};

to make it clear that these 16 bytes are up for grabs to be re-interpreted based on
the value of "cpuvendor" (and possibly also "cpuid" if a vendor wants different data
logged for different models).

-Tony

2022-10-24 23:42:07

by Luck, Tony

[permalink] [raw]
Subject: RE: [PATCH 1/3] x86/MCE, EDAC/mce_amd: Add support for new MCA_SYND{1,2} registers

> That's why I'm proposing this prepended length in front of the
> vendor_info field so that it can be extended properly in the future
> and new shared members can also be added and this whole scheme can be
> forward-compatible, so to speak, without having to cast anything in
> stone.

I missed the pre-pended length bit ... with that it all makes sense.

-Tony

2022-10-25 00:24:57

by Luck, Tony

[permalink] [raw]
Subject: RE: [PATCH 1/3] x86/MCE, EDAC/mce_amd: Add support for new MCA_SYND{1,2} registers

> I missed the pre-pended length bit ... with that it all makes sense.

Though the other place where mce records are visible to user space
is in trace records. See:

include/trace/events/mce.h

(N.B. this needs an update to include the ppin and microcode fields).

If these new fields need to be in the trace log, and we want to make
it easy to extend, then have to figure out how to handle this in a way
that doesn't confuse applications (rasdaemon ... are there others)
that consume the trace records.

-Tony

2022-10-25 16:58:29

by Luck, Tony

[permalink] [raw]
Subject: RE: [PATCH 1/3] x86/MCE, EDAC/mce_amd: Add support for new MCA_SYND{1,2} registers

> I do like the suggestion from Boris to have a length and vendor data in struct
> mce. This should keep mcelog happy while letting us treat struct mce as a
> semi-internal kernel structure. Also, this avoids having to mess around with
> all the notifier chain definitions.
>
> I'll start working on an implementation if that's okay with you all. I'll
> include kernel and rasdaemon patches together so we can have context for
> discussion.

Sounds good to me. I'll work on the mcelog changes once you have some
agreed definition for the "struct mce" changes.

-Tony

2022-10-25 17:08:02

by Yazen Ghannam

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86/MCE, EDAC/mce_amd: Add support for new MCA_SYND{1,2} registers

On Mon, Oct 24, 2022 at 09:52:45PM +0000, Luck, Tony wrote:
> > I missed the pre-pended length bit ... with that it all makes sense.
>
> Though the other place where mce records are visible to user space
> is in trace records. See:
>
> include/trace/events/mce.h
>
> (N.B. this needs an update to include the ppin and microcode fields).
>
> If these new fields need to be in the trace log, and we want to make
> it easy to extend, then have to figure out how to handle this in a way
> that doesn't confuse applications (rasdaemon ... are there others)
> that consume the trace records.
>

Hi folks,

I think the "right way" to use tracepoints is to parse the data according to
the format provided by the tracepoint itself. You can see an example of
rasdaemon doing that here:
https://github.com/mchehab/rasdaemon/blob/c2255178a49f62c53009a456bc37dd5e37332f09/ras-mce-handler.c#L386

A tracepoint user should not assume a fixed struct layout, so adding and
rearranging fields shouldn't be a problem. I'm not sure about removing a
field. It seems to me that this should be okay in the interface, and it's up
to the application how it wants to handle a field that isn't found.

Also, rasdaemon does already support raw, variable-length data:
https://github.com/mchehab/rasdaemon/blob/master/ras-non-standard-handler.c

This could be an example used to update the MCE part.

I think the only (or popular?) userspace tool that relies on the layout of
struct mce is mcelog. This is not supported on modern AMD systems, and we
refer users to rasdaemon instead.

Another option could be to define a new tracepoint. Userspace already needs to
be updated to recognize new fields, so I don't think it's much of a stretch to
add a new tracepoint handler. This may be simpler than trying to fit
vendor-specific info into an existing tracepoint and then decoding it later in
userspace.

I do like the suggestion from Boris to have a length and vendor data in struct
mce. This should keep mcelog happy while letting us treat struct mce as a
semi-internal kernel structure. Also, this avoids having to mess around with
all the notifier chain definitions.

I'll start working on an implementation if that's okay with you all. I'll
include kernel and rasdaemon patches together so we can have context for
discussion.

Thanks,
Yazen

2022-10-25 18:57:09

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86/MCE, EDAC/mce_amd: Add support for new MCA_SYND{1,2} registers

On Tue, Oct 25, 2022 at 04:35:15PM +0000, Yazen Ghannam wrote:
> I think the "right way" to use tracepoints is to parse the data according to
> the format provided by the tracepoint itself. You can see an example of
> rasdaemon doing that here:
> https://github.com/mchehab/rasdaemon/blob/c2255178a49f62c53009a456bc37dd5e37332f09/ras-mce-handler.c#L386

Lemme add Rostedt.

So now we have libtraceevent and here's an example how to do it:

https://patchwork.kernel.org/project/linux-trace-devel/patch/[email protected]/
https://www.trace-cmd.org/Documentation/libtracefs/libtracefs-kprobes.html

Reportedly, rasdaemon is still using the old libtraceevent method. So it
probably should be updated to use the new library version.

> A tracepoint user should not assume a fixed struct layout, so adding
> and rearranging fields shouldn't be a problem. I'm not sure about
> removing a field. It seems to me that this should be okay in the
> interface, and it's up to the application how it wants to handle a
> field that isn't found.

From looking at the examples, I think the new libtraceevent should be
able to handle all that just fine.

> Another option could be to define a new tracepoint.

Yeah, no. Let's get this one to work pls.

trace_mce_record2() looks like the old attempts to design a syscall
better. :)

> Userspace already needs to be updated to recognize new fields, so I
> don't think it's much of a stretch to add a new tracepoint handler.

Syncing with other components is always a stretch. You're forgetting how
distros lag behind and don't always have the needed resources to update
their userspace.

Or they can't update because of enterprise raisins.

So no, it is not trivial, trust me. It's a bunch of politics and effort.

> This may be simpler than trying to fit vendor-specific info into an
> existing tracepoint and then decoding it later in userspace.

Until that new tracepoint becomes insufficient in the future and we need
to add trace_mce_record3(). No, you don't want that. :)

Thx.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2022-10-25 19:55:34

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86/MCE, EDAC/mce_amd: Add support for new MCA_SYND{1,2} registers

On Tue, 25 Oct 2022 20:05:48 +0200
Borislav Petkov <[email protected]> wrote:

> On Tue, Oct 25, 2022 at 04:35:15PM +0000, Yazen Ghannam wrote:
> > I think the "right way" to use tracepoints is to parse the data according to
> > the format provided by the tracepoint itself. You can see an example of
> > rasdaemon doing that here:
> > https://github.com/mchehab/rasdaemon/blob/c2255178a49f62c53009a456bc37dd5e37332f09/ras-mce-handler.c#L386
>
> Lemme add Rostedt.
>
> So now we have libtraceevent and here's an example how to do it:

Yes, I'm really grateful to Mauro for adapting an earlier version of
libtraceevent, although it was just cut and pasted into rasdaemon. But it
is time to use the official library, which had a bit of changes to the
interface.

>
> https://patchwork.kernel.org/project/linux-trace-devel/patch/[email protected]/
> https://www.trace-cmd.org/Documentation/libtracefs/libtracefs-kprobes.html
>
> Reportedly, rasdaemon is still using the old libtraceevent method. So it
> probably should be updated to use the new library version.

Definitely.

>
> > A tracepoint user should not assume a fixed struct layout, so adding
> > and rearranging fields shouldn't be a problem. I'm not sure about
> > removing a field. It seems to me that this should be okay in the
> > interface, and it's up to the application how it wants to handle a
> > field that isn't found.
>
> >From looking at the examples, I think the new libtraceevent should be
> able to handle all that just fine.

As long as the code can handle a field removed or renamed. It allows the
application to check if it is there or not.

>
> > Another option could be to define a new tracepoint.
>
> Yeah, no. Let's get this one to work pls.

You can always add a trace event on top of an existing trace event with a
"custom" trace event.

See https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/samples/trace_events/trace_custom_sched.h

Also, take a look at some of the code that is coming with libtracefs:

https://patchwork.kernel.org/project/linux-trace-devel/list/?series=688772

(Note, it's not there just yet)

Basically you can just do:

instance = tracefs_instance_create(TOOL_NAME);

for (i = 0; i < nr_cpus; i++) {
tcpus[i] = tracefs_cpu_open(instance, i, false);

And then you can read from the raw trace buffers;

/* Read all "ras" events */
tep = tracefs_local_events_system(NULL, "ras");

/* Note pevent was renamed to tep */

/* I need to write up kbuffer man pages :-/ */
kbuf = kbuffer_alloc(sizeof(long) == 8, !tep_is_bigendian());

/* I guess you want to retrieve any data */
tracefs_instance_file_write(instance, "buffer_percent", "0");

buf = malloc(tracefs_cpu_read_size(tcpu));

/* false means block until there's data */
tracefs_cpu_read(tcpus[i], buf, false);

struct tep_record *record;
unsigned long long ts;

/* Load the read raw data into the kbuffer parser */
kbuffer_load_subbuffer(kbuf, buf);

for (;;) {
record.data = kbuffer_read_event(kbuf, &ts);
if (!record.data)
break;
record.ts = ts;

process(tep, record);

kbuffer_next_event(kbuf, NULL);

/* There's tracefs iterators that do this too, but I'm
* working on adding more features to them. */
}


static void process(struct tep_handle *tep, struct tep_record *record)
{
static struct tep_event *mc_event;
static struct tep_event *aer_event;
[..]
static struct trace_seq s;
struct tep_event *event;
unsigned long long val;

/* Initialize the static events to use them for data */
if (!mc_event) {
trace_seq_init(&s);
mc_event = tep_find_event_by_name(tep, "ras", "mc_event");
/* Do error checking? */
aer_event = tep_find_event_by_name(tep, "ras", "aer_event");
[..]
}

/* Remove any previous strings in s. */
trace_seq_reset(&s);

event = tep_find_event_by_record(tep, record);
if (event->id == mc_event->id) {
tep_get_field_val(&s, event, "address", record, &val, 0);
[..]
}
}


With libtracefs and libtraceevent, process trace events is so much easier
than it use to be!

-- Steve

2022-11-01 17:35:17

by Yazen Ghannam

[permalink] [raw]
Subject: Re: [PATCH 1/3] x86/MCE, EDAC/mce_amd: Add support for new MCA_SYND{1,2} registers

On Tue, Oct 25, 2022 at 03:28:47PM -0400, Steven Rostedt wrote:

...

> With libtracefs and libtraceevent, process trace events is so much easier
> than it use to be!
>

Thanks Boris and Steve!

I'll start digging into this.

-Yazen