As a temporary storage, staged_config[] in rdt_domain should be cleared
before and after it is used. The stale value in staged_config[] could
cause an MSR access error.
Here is a reproducer on a system with 16 usable CLOSIDs for a 15-way L3
cache (MBA should be disabled if the number of CLOSIDs for MB is less than
16):
mount -t resctrl resctrl -o cdp /sys/fs/resctrl
mkdir /sys/fs/resctrl/p{1..7}
umount /sys/fs/resctrl/
mount -t resctrl resctrl /sys/fs/resctrl
mkdir /sys/fs/resctrl/p{1..8}
An error occurs when creating resource group named p8:
unchecked MSR access error: WRMSR to 0xca0 (tried to write 0x00000000000007ff) at rIP: 0xffffffff82249142 (cat_wrmsr+0x32/0x60)
Call Trace:
<IRQ>
__flush_smp_call_function_queue+0x11d/0x170
__sysvec_call_function+0x24/0xd0
sysvec_call_function+0x89/0xc0
</IRQ>
<TASK>
asm_sysvec_call_function+0x16/0x20
When creating a new resource control group, hardware will be configured
by the following process:
rdtgroup_mkdir()
rdtgroup_mkdir_ctrl_mon()
rdtgroup_init_alloc()
resctrl_arch_update_domains()
resctrl_arch_update_domains() iterates and updates all resctrl_conf_type
whose have_new_ctrl is true. Since staged_config[] holds the same values as
when CDP was enabled, it will continue to update the CDP_CODE and CDP_DATA
configurations. When group p8 is created, get_config_index() called in
resctrl_arch_update_domains() will return 16 and 17 as the CLOSIDs for
CDP_CODE and CDP_DATA, which will be translated to an invalid register -
0xca0 in this scenario.
Fix it by clearing staged_config[] before and after it is used.
Fixes: 75408e43509e ("x86/resctrl: Allow different CODE/DATA configurations to be staged")
Cc: <[email protected]>
Signed-off-by: Shawn Wang <[email protected]>
Suggested-by: Xin Hao <[email protected]>
---
Changes since v3:
- Update the commit message as suggested by Reinette Chatre.
- Rename resctrl_staged_configs_clear() to rdt_staged_configs_clear().
- Move rdt_staged_configs_clear() to arch/x86/kernel/cpu/resctrl/internal.h.
Changes since v2:
- Update the commit message as suggested by Reinette Chatre.
- Make the clearing work more robust.
Changes since v1:
- Move the clearing from schemata_list_destroy() to resctrl_arch_update_domains().
- Update the commit message as suggested by Reinette Chatre.
- Add stable tag as suggested by James Morse.
---
arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 7 ++-----
arch/x86/kernel/cpu/resctrl/internal.h | 1 +
arch/x86/kernel/cpu/resctrl/rdtgroup.c | 25 +++++++++++++++++++----
3 files changed, 24 insertions(+), 9 deletions(-)
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index 1dafbdc5ac31..84f23327caed 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -374,7 +374,6 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
{
struct resctrl_schema *s;
struct rdtgroup *rdtgrp;
- struct rdt_domain *dom;
struct rdt_resource *r;
char *tok, *resname;
int ret = 0;
@@ -403,10 +402,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
goto out;
}
- list_for_each_entry(s, &resctrl_schema_all, list) {
- list_for_each_entry(dom, &s->res->domains, list)
- memset(dom->staged_config, 0, sizeof(dom->staged_config));
- }
+ rdt_staged_configs_clear();
while ((tok = strsep(&buf, "\n")) != NULL) {
resname = strim(strsep(&tok, ":"));
@@ -451,6 +447,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
}
out:
+ rdt_staged_configs_clear();
rdtgroup_kn_unlock(of->kn);
cpus_read_unlock();
return ret ?: nbytes;
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 5f7128686cfd..0b5c6c76f6f7 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -537,5 +537,6 @@ bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
void __check_limbo(struct rdt_domain *d, bool force_free);
void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
void __init thread_throttle_mode_init(void);
+void rdt_staged_configs_clear(void);
#endif /* _ASM_X86_RESCTRL_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index e5a48f05e787..fee8ed86b31c 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -78,6 +78,19 @@ void rdt_last_cmd_printf(const char *fmt, ...)
va_end(ap);
}
+void rdt_staged_configs_clear(void)
+{
+ struct rdt_resource *r;
+ struct rdt_domain *dom;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ for_each_alloc_capable_rdt_resource(r) {
+ list_for_each_entry(dom, &r->domains, list)
+ memset(dom->staged_config, 0, sizeof(dom->staged_config));
+ }
+}
+
/*
* Trivial allocator for CLOSIDs. Since h/w only supports a small number,
* we can keep a bitmap of free CLOSIDs in a single integer.
@@ -2841,7 +2854,9 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
{
struct resctrl_schema *s;
struct rdt_resource *r;
- int ret;
+ int ret = 0;
+
+ rdt_staged_configs_clear();
list_for_each_entry(s, &resctrl_schema_all, list) {
r = s->res;
@@ -2852,20 +2867,22 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
} else {
ret = rdtgroup_init_cat(s, rdtgrp->closid);
if (ret < 0)
- return ret;
+ goto out;
}
ret = resctrl_arch_update_domains(r, rdtgrp->closid);
if (ret < 0) {
rdt_last_cmd_puts("Failed to initialize allocations\n");
- return ret;
+ goto out;
}
}
rdtgrp->mode = RDT_MODE_SHAREABLE;
- return 0;
+out:
+ rdt_staged_configs_clear();
+ return ret;
}
static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
--
2.27.0
Hi, Shawn,
> As a temporary storage, staged_config[] in rdt_domain should be cleared before
> and after it is used. The stale value in staged_config[] could cause an MSR
> access error.
Why should staged_config[] be cleared both before and after it's used?
If it's cleared only before it's used, does it make more sense?
Usually temp variables are initialized before they are used and their values will
be ignored after usage. In particular, clearing staged_config[] in a double for
loop is pretty heavy code, and an extra clearing of staged_config[] after usage
takes unnecessarily longer.
> access error.
>
> Here is a reproducer on a system with 16 usable CLOSIDs for a 15-way L3 Cache
> (MBA should be disabled if the number of CLOSIDs for MB is less than
> 16.) :
> mount -t resctrl resctrl -o cdp /sys/fs/resctrl
> mkdir /sys/fs/resctrl/p{1..7}
> umount /sys/fs/resctrl/
> mount -t resctrl resctrl /sys/fs/resctrl
> mkdir /sys/fs/resctrl/p{1..8}
>
> An error occurs when creating resource group named p8:
> unchecked MSR access error: WRMSR to 0xca0 (tried to write
> 0x00000000000007ff) at rIP: 0xffffffff82249142 (cat_wrmsr+0x32/0x60)
> Call Trace:
> <IRQ>
> __flush_smp_call_function_queue+0x11d/0x170
> __sysvec_call_function+0x24/0xd0
> sysvec_call_function+0x89/0xc0
> </IRQ>
> <TASK>
> asm_sysvec_call_function+0x16/0x20
>
> When creating a new resource control group, hardware will be configured by
> the following process:
> rdtgroup_mkdir()
> rdtgroup_mkdir_ctrl_mon()
> rdtgroup_init_alloc()
> resctrl_arch_update_domains()
>
> resctrl_arch_update_domains() iterates and updates all resctrl_conf_type whose
> have_new_ctrl is true. Since staged_config[] holds the same values as when CDP
> was enabled, it will continue to update the CDP_CODE and CDP_DATA
> configurations. When group p8 is created, get_config_index() called in
> resctrl_arch_update_domains() will return 16 and 17 as the CLOSIDs for
> CDP_CODE and CDP_DATA, which will be translated to an invalid register -
> 0xca0 in this scenario.
>
> Fix it by clearing staged_config[] before and after it is used.
>
> Fixes: 75408e43509e ("x86/resctrl: Allow different CODE/DATA configurations
> to be staged")
> Cc: <[email protected]>
> Signed-off-by: Shawn Wang <[email protected]>
> Suggested-by: Xin Hao <[email protected]>
> ---
> Changes since v3:
> - Update the commit message suggested by Reiniette Chatre.
> - Rename resctrl_staged_configs_clear() to rdt_staged_configs_clear().
> - Move rdt_staged_configs_clear() to arch/x86/kernel/cpu/resctrl/internal.h.
>
> Changes since v2:
> - Update the commit message suggested by Reiniette Chatre.
> - Make the clearing work more robust.
>
> Changes since v1:
> - Move the clearing from schemata_list_destroy() to
> resctrl_arch_update_domains().
> - Update the commit message suggested by Reiniette Chatre.
> - Add stable tag suggested by James Morse.
> ---
> arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 7 ++-----
> arch/x86/kernel/cpu/resctrl/internal.h | 1 +
> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 25 +++++++++++++++++++----
> 3 files changed, 24 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
> b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
> index 1dafbdc5ac31..84f23327caed 100644
> --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
> +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
> @@ -374,7 +374,6 @@ ssize_t rdtgroup_schemata_write(struct
> kernfs_open_file *of, {
> struct resctrl_schema *s;
> struct rdtgroup *rdtgrp;
> - struct rdt_domain *dom;
> struct rdt_resource *r;
> char *tok, *resname;
> int ret = 0;
> @@ -403,10 +402,7 @@ ssize_t rdtgroup_schemata_write(struct
> kernfs_open_file *of,
> goto out;
> }
>
> - list_for_each_entry(s, &resctrl_schema_all, list) {
> - list_for_each_entry(dom, &s->res->domains, list)
> - memset(dom->staged_config, 0, sizeof(dom-
> >staged_config));
> - }
> + rdt_staged_configs_clear();
>
> while ((tok = strsep(&buf, "\n")) != NULL) {
> resname = strim(strsep(&tok, ":"));
> @@ -451,6 +447,7 @@ ssize_t rdtgroup_schemata_write(struct
> kernfs_open_file *of,
> }
>
> out:
> + rdt_staged_configs_clear();
> rdtgroup_kn_unlock(of->kn);
> cpus_read_unlock();
> return ret ?: nbytes;
> diff --git a/arch/x86/kernel/cpu/resctrl/internal.h
> b/arch/x86/kernel/cpu/resctrl/internal.h
> index 5f7128686cfd..0b5c6c76f6f7 100644
> --- a/arch/x86/kernel/cpu/resctrl/internal.h
> +++ b/arch/x86/kernel/cpu/resctrl/internal.h
> @@ -537,5 +537,6 @@ bool has_busy_rmid(struct rdt_resource *r, struct
> rdt_domain *d); void __check_limbo(struct rdt_domain *d, bool force_free);
> void rdt_domain_reconfigure_cdp(struct rdt_resource *r); void __init
> thread_throttle_mode_init(void);
> +void rdt_staged_configs_clear(void);
>
> #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git
> a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> index e5a48f05e787..fee8ed86b31c 100644
> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> @@ -78,6 +78,19 @@ void rdt_last_cmd_printf(const char *fmt, ...)
> va_end(ap);
> }
>
> +void rdt_staged_configs_clear(void)
> +{
> + struct rdt_resource *r;
> + struct rdt_domain *dom;
Please reorder the variable definitions in reverse Christmas tree order.
> +
> + lockdep_assert_held(&rdtgroup_mutex);
> +
> + for_each_alloc_capable_rdt_resource(r) {
> + list_for_each_entry(dom, &r->domains, list)
> + memset(dom->staged_config, 0, sizeof(dom-
> >staged_config));
> + }
> +}
> +
> /*
> * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
> * we can keep a bitmap of free CLOSIDs in a single integer.
> @@ -2841,7 +2854,9 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
> {
> struct resctrl_schema *s;
> struct rdt_resource *r;
> - int ret;
> + int ret = 0;
> +
> + rdt_staged_configs_clear();
>
> list_for_each_entry(s, &resctrl_schema_all, list) {
> r = s->res;
> @@ -2852,20 +2867,22 @@ static int rdtgroup_init_alloc(struct rdtgroup
> *rdtgrp)
> } else {
> ret = rdtgroup_init_cat(s, rdtgrp->closid);
> if (ret < 0)
> - return ret;
> + goto out;
> }
>
> ret = resctrl_arch_update_domains(r, rdtgrp->closid);
> if (ret < 0) {
> rdt_last_cmd_puts("Failed to initialize allocations\n");
> - return ret;
> + goto out;
> }
>
> }
>
> rdtgrp->mode = RDT_MODE_SHAREABLE;
>
> - return 0;
> +out:
> + rdt_staged_configs_clear();
> + return ret;
> }
>
> static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
> --
> 2.27.0
Thanks.
-Fenghua
Hi Fenghua,
On 12/9/2022 6:18 PM, Yu, Fenghua wrote:
> Hi, Shawn,
>
>> As a temporary storage, staged_config[] in rdt_domain should be cleared before
>> and after it is used. The stale value in staged_config[] could cause an MSR
>> access error.
>
> Why should staged_config[] be cleared both before and after it's used?
> If it's cleared only before it's used, does it make more sense?
This is something we discussed during v2 of this fix.
> Usually temp variables are initialized before they are used and their values will
> be ignored after usage. Especially clearing stage_config[] in double for
Ideally, temporary variables have usage that matches their lifetime: the variable
is created, used, and then freed. The staged_config[] array is different in that it
has temporary usage but its lifetime matches that of the domain.
I agree that temporary variables should be initialized before they are used, but I
do prefer that we do not leave stale data around, especially since stale data was
the cause of the bug needing to be fixed with this patch.
> loop is pretty heavy code and an extra clearing stage_config[] after usage
> takes unnecessary longer time.
Could you please elaborate on this being heavy code? The clearing does not involve
interacting with the registers, it just clears the 3 entry array, one array per
domain. This is definitely not in a hot path since it is done when the user
initiates reconfiguration, either by writing to the schemata file or creating a new
resctrl directory. I thus do not see harm in doing the clearing after configuration
to ensure that no stale data is left around. I would like to better understand your
concern.
...
>> a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>> b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>> index e5a48f05e787..fee8ed86b31c 100644
>> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>> @@ -78,6 +78,19 @@ void rdt_last_cmd_printf(const char *fmt, ...)
>> va_end(ap);
>> }
>>
>> +void rdt_staged_configs_clear(void)
>> +{
>> + struct rdt_resource *r;
>> + struct rdt_domain *dom;
>
> Please reorder the variable definitions in the reversed Christmas tree order.
Could you please provide more detail on how you would like this to look?
Since the lines have equal length, they pass the initial ordering requirement,
so there must be some finer-grained requirement for lines of equal length.
Reinette
Hi Shawn,
On 12/8/2022 6:44 PM, Shawn Wang wrote:
> As a temporary storage, staged_config[] in rdt_domain should be cleared
> before and after it is used. The stale value in staged_config[] could
> cause an MSR access error.
>
> Here is a reproducer on a system with 16 usable CLOSIDs for a 15-way L3
> Cache (MBA should be disabled if the number of CLOSIDs for MB is less than
> 16.) :
> mount -t resctrl resctrl -o cdp /sys/fs/resctrl
> mkdir /sys/fs/resctrl/p{1..7}
> umount /sys/fs/resctrl/
> mount -t resctrl resctrl /sys/fs/resctrl
> mkdir /sys/fs/resctrl/p{1..8}
>
> An error occurs when creating resource group named p8:
> unchecked MSR access error: WRMSR to 0xca0 (tried to write 0x00000000000007ff) at rIP: 0xffffffff82249142 (cat_wrmsr+0x32/0x60)
> Call Trace:
> <IRQ>
> __flush_smp_call_function_queue+0x11d/0x170
> __sysvec_call_function+0x24/0xd0
> sysvec_call_function+0x89/0xc0
> </IRQ>
> <TASK>
> asm_sysvec_call_function+0x16/0x20
>
> When creating a new resource control group, hardware will be configured
> by the following process:
> rdtgroup_mkdir()
> rdtgroup_mkdir_ctrl_mon()
> rdtgroup_init_alloc()
> resctrl_arch_update_domains()
>
> resctrl_arch_update_domains() iterates and updates all resctrl_conf_type
> whose have_new_ctrl is true. Since staged_config[] holds the same values as
> when CDP was enabled, it will continue to update the CDP_CODE and CDP_DATA
> configurations. When group p8 is created, get_config_index() called in
> resctrl_arch_update_domains() will return 16 and 17 as the CLOSIDs for
> CDP_CODE and CDP_DATA, which will be translated to an invalid register -
> 0xca0 in this scenario.
>
> Fix it by clearing staged_config[] before and after it is used.
>
> Fixes: 75408e43509e ("x86/resctrl: Allow different CODE/DATA configurations to be staged")
> Cc: <[email protected]>
> Signed-off-by: Shawn Wang <[email protected]>
> Suggested-by: Xin Hao <[email protected]>
> ---
Thank you very much.
Tested-by: Reinette Chatre <[email protected]>
Reviewed-by: Reinette Chatre <[email protected]>
Reinette