2022-02-17 18:37:07

by James Morse

[permalink] [raw]
Subject: [PATCH v3 19/21] x86/resctrl: Rename and change the units of resctrl_cqm_threshold

resctrl_cqm_threshold is stored in a hardware specific chunk size,
but exposed to user-space as bytes.

This means the filesystem parts of resctrl need to know how the hardware
counts, to convert the user provided byte value to chunks. The interface
between the architecture's resctrl code and the filesystem ought to
treat everything as bytes.

Change the unit of resctrl_cqm_threshold to bytes. resctrl_arch_rmid_read()
still returns its value in chunks, so this needs converting to bytes.
As all the callers have been touched, rename the variable to
resctrl_rmid_realloc_threshold, which describes what the value is for.

Signed-off-by: James Morse <[email protected]>
---
arch/x86/kernel/cpu/resctrl/internal.h | 1 -
arch/x86/kernel/cpu/resctrl/monitor.c | 34 ++++++++++++--------------
arch/x86/kernel/cpu/resctrl/rdtgroup.c | 9 ++-----
include/linux/resctrl.h | 2 ++
4 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index c8d52fbee8cd..e26a4d67e204 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -98,7 +98,6 @@ struct rmid_read {
u64 val;
};

-extern unsigned int resctrl_cqm_threshold;
extern bool rdt_alloc_capable;
extern bool rdt_mon_capable;
extern unsigned int rdt_mon_features;
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index e89ae648046b..e91d9a7024a2 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -37,8 +37,8 @@ static LIST_HEAD(rmid_free_lru);
* @rmid_limbo_count count of currently unused but (potentially)
* dirty RMIDs.
* This counts RMIDs that no one is currently using but that
- * may have a occupancy value > intel_cqm_threshold. User can change
- * the threshold occupancy value.
+ * may have a occupancy value > resctrl_rmid_realloc_threshold. User can
+ * change the threshold occupancy value.
*/
static unsigned int rmid_limbo_count;

@@ -59,10 +59,10 @@ bool rdt_mon_capable;
unsigned int rdt_mon_features;

/*
- * This is the threshold cache occupancy at which we will consider an
+ * This is the threshold cache occupancy in bytes at which we will consider an
* RMID available for re-allocation.
*/
-unsigned int resctrl_cqm_threshold;
+unsigned int resctrl_rmid_realloc_threshold;

#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))

@@ -223,14 +223,13 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
*/
void __check_limbo(struct rdt_domain *d, bool force_free)
{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
struct rmid_entry *entry;
- struct rdt_resource *r;
u32 crmid = 1, nrmid;
bool rmid_dirty;
u64 val = 0;

- r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
-
/*
* Skip RMID 0 and start from RMID 1 and check all the RMIDs that
* are marked as busy for occupancy < threshold. If the occupancy
@@ -245,10 +244,12 @@ void __check_limbo(struct rdt_domain *d, bool force_free)
entry = __rmid_entry(nrmid);

if (resctrl_arch_rmid_read(r, d, entry->rmid,
- QOS_L3_OCCUP_EVENT_ID, &val))
+ QOS_L3_OCCUP_EVENT_ID, &val)) {
rmid_dirty = true;
- else
- rmid_dirty = (val >= resctrl_cqm_threshold);
+ } else {
+ val *= hw_res->mon_scale;
+ rmid_dirty = (val >= resctrl_rmid_realloc_threshold);
+ }

if (force_free || !rmid_dirty) {
clear_bit(entry->rmid, d->rmid_busy_llc);
@@ -289,13 +290,12 @@ int alloc_rmid(void)

static void add_rmid_to_limbo(struct rmid_entry *entry)
{
- struct rdt_resource *r;
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
struct rdt_domain *d;
int cpu, err;
u64 val = 0;

- r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
-
entry->busy = 0;
cpu = get_cpu();
list_for_each_entry(d, &r->domains, list) {
@@ -303,7 +303,8 @@ static void add_rmid_to_limbo(struct rmid_entry *entry)
err = resctrl_arch_rmid_read(r, d, entry->rmid,
QOS_L3_OCCUP_EVENT_ID,
&val);
- if (err || val <= resctrl_cqm_threshold)
+ val *= hw_res->mon_scale;
+ if (err || val <= resctrl_rmid_realloc_threshold)
continue;
}

@@ -762,10 +763,7 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
*
* For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
*/
- resctrl_cqm_threshold = cl_size * 1024 / r->num_rmid;
-
- /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
- resctrl_cqm_threshold /= hw_res->mon_scale;
+ resctrl_rmid_realloc_threshold = cl_size * 1024 / r->num_rmid;

ret = dom_data_init(r);
if (ret)
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 7ec089d72ab7..93b3697027df 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -1030,10 +1030,7 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of,
static int max_threshold_occ_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct rdt_resource *r = of->kn->parent->priv;
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
-
- seq_printf(seq, "%u\n", resctrl_cqm_threshold * hw_res->mon_scale);
+ seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold);

return 0;
}
@@ -1055,7 +1052,6 @@ static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
- struct rdt_hw_resource *hw_res;
unsigned int bytes;
int ret;

@@ -1066,8 +1062,7 @@ static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
if (bytes > (boot_cpu_data.x86_cache_size * 1024))
return -EINVAL;

- hw_res = resctrl_to_arch_res(of->kn->parent->priv);
- resctrl_cqm_threshold = bytes / hw_res->mon_scale;
+ resctrl_rmid_realloc_threshold = bytes;

return nbytes;
}
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 5d57e2610c79..c79a180e578c 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -248,4 +248,6 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
u32 rmid, enum resctrl_event_id eventid);

+extern unsigned int resctrl_rmid_realloc_threshold;
+
#endif /* _RESCTRL_H */
--
2.30.2


2022-03-17 20:33:02

by Reinette Chatre

[permalink] [raw]
Subject: Re: [PATCH v3 19/21] x86/resctrl: Rename and change the units of resctrl_cqm_threshold

Hi James,

On 2/17/2022 10:21 AM, James Morse wrote:
> resctrl_cqm_threshold is stored in a hardware specific chunk size,
> but exposed to user-space as bytes.
>
> This means the filesystem parts of resctrl need to know how the hardware
> counts, to convert the user provided byte value to chunks. The interface
> between the architecture's resctrl code and the filesystem ought to
> treat everything as bytes.
>
> Change the unit of resctrl_cqm_threshold to bytes. resctrl_arch_rmid_read()
> still returns its value in chunks, so this needs converting to bytes.
> As all the callers have been touched, rename the variable to
> resctrl_rmid_realloc_threshold, which describes what the value is for.
>
> Signed-off-by: James Morse <[email protected]>

...


> @@ -762,10 +763,7 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
> *
> * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
> */
> - resctrl_cqm_threshold = cl_size * 1024 / r->num_rmid;
> -
> - /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
> - resctrl_cqm_threshold /= hw_res->mon_scale;
> + resctrl_rmid_realloc_threshold = cl_size * 1024 / r->num_rmid;
>
> ret = dom_data_init(r);
> if (ret)
> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> index 7ec089d72ab7..93b3697027df 100644
> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> @@ -1030,10 +1030,7 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of,
> static int max_threshold_occ_show(struct kernfs_open_file *of,
> struct seq_file *seq, void *v)
> {
> - struct rdt_resource *r = of->kn->parent->priv;
> - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
> -
> - seq_printf(seq, "%u\n", resctrl_cqm_threshold * hw_res->mon_scale);
> + seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold);
>
> return 0;
> }


This change has some user visible impact that I am still digesting but thought
that I would share for your consideration.

As seen in the above two snippets, the original code did:

resctrl_cqm_threshold /= hw_res->mon_scale; /* resctrl_cqm_threshold used internally */

resctrl_cqm_threshold * hw_res->mon_scale; /* this is displayed to user */

The original loss due to truncation during the division is not recovered
when the value is displayed to the user, so the user may see significant
differences before and after this patch.

I tried this out on a system with a large cache and the before and after
information is significant:
Before this patch:
info/L3_MON/max_threshold_occupancy:147456

After this patch:
info/L3_MON/max_threshold_occupancy:196608

As I understand it, this change indeed represents the information more accurately, but
I found it noteworthy that this is not just a simple "change the units" and
may thus have broader impact and may indeed result in different behavior that
should be considered.

Reinette

2022-03-31 04:43:27

by James Morse

[permalink] [raw]
Subject: Re: [PATCH v3 19/21] x86/resctrl: Rename and change the units of resctrl_cqm_threshold

Hi Reinette,

On 17/03/2022 17:00, Reinette Chatre wrote:
> On 2/17/2022 10:21 AM, James Morse wrote:
>> resctrl_cqm_threshold is stored in a hardware specific chunk size,
>> but exposed to user-space as bytes.
>>
>> This means the filesystem parts of resctrl need to know how the hardware
>> counts, to convert the user provided byte value to chunks. The interface
>> between the architecture's resctrl code and the filesystem ought to
>> treat everything as bytes.
>>
>> Change the unit of resctrl_cqm_threshold to bytes. resctrl_arch_rmid_read()
>> still returns its value in chunks, so this needs converting to bytes.
>> As all the callers have been touched, rename the variable to
>> resctrl_rmid_realloc_threshold, which describes what the value is for.

>> @@ -762,10 +763,7 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
>> *
>> * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
>> */
>> - resctrl_cqm_threshold = cl_size * 1024 / r->num_rmid;
>> -
>> - /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
>> - resctrl_cqm_threshold /= hw_res->mon_scale;
>> + resctrl_rmid_realloc_threshold = cl_size * 1024 / r->num_rmid;
>>
>> ret = dom_data_init(r);
>> if (ret)
>> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>> index 7ec089d72ab7..93b3697027df 100644
>> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>> @@ -1030,10 +1030,7 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of,
>> static int max_threshold_occ_show(struct kernfs_open_file *of,
>> struct seq_file *seq, void *v)
>> {
>> - struct rdt_resource *r = of->kn->parent->priv;
>> - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
>> -
>> - seq_printf(seq, "%u\n", resctrl_cqm_threshold * hw_res->mon_scale);
>> + seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold);
>>
>> return 0;
>> }
>
>
> This change has some user visible impact that I am still digesting but thought
> that I would share for your consideration.
>
> As seen in the above two snippets, the original code did:
>
> resctrl_cqm_threshold /= hw_res->mon_scale; /* resctrl_cqm_threshold used internally */
>
> resctrl_cqm_threshold * hw_res->mon_scale; /* this is displayed to user */
>
> The original loss due to truncation during the division is not recovered
> when the value is displayed to the user the user may see significant differences
> before and after this patch.
>
> I tried this out on a system with a large cache and the before and after
> information is significant:
> Before this patch:
> info/L3_MON/max_threshold_occupancy:147456
>
> After this patch:
> info/L3_MON/max_threshold_occupancy:196608

Hmm. I hadn't considered that information would be lost by the current way of doing this.
It looks like this happens because num_rmid isn't necessarily a power of 2.


> As I understand this change indeed represents the information more accurately but
> I found it noteworthy that this is not just a simple "change the units" and
> may thus have broader impact and may indeed result in different behavior that
> should be considered.

I agree it more accurately reflects resctrl's calculation of "the number
of lines tagged per RMID if all RMIDs have the same number of lines", but if that
produces a number the hardware will never actually measure, then the rounding is still
happening, but somewhere else.

I think the right thing to do is round resctrl_rmid_realloc_threshold down to the nearest
multiple of hw_res->mon_scale in rdt_get_mon_l3_config(). This way the filesystem parts
still handle things in bytes, and the architecture code provides the quantised value that
will actually get measured. It's this value that should be reported to user-space.

It doesn't look like the 'Upscaling Factor' is guaranteed to be a power of 2, so I can't
use the round_down() helpers.

I've added this to the commit message:
| Neither r->num_rmid nor hw_res->mon_scale is guaranteed to be a power
| of 2, so the existing code introduces a rounding error from resctrl's
| theoretical fraction of the cache usage. This behaviour is kept as it
| ensures the user visible value matches the value read from hardware
| when the rmid will be reallocated.

and the hunk below, which fixes it for me.



Thanks,

James

---------------%<---------------
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index b18e227d585c..fb81d650c457 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -753,6 +753,7 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
unsigned int cl_size = boot_cpu_data.x86_cache_size;
+ u64 threshold;
int ret;

hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale;
@@ -771,7 +772,15 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
*
* For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
*/
- resctrl_rmid_realloc_threshold = cl_size * 1024 / r->num_rmid;
+ threshold = cl_size * 1024 / r->num_rmid;
+
+ /*
+ * Because num_rmid may not be a power of two, round the value down
+ * to the nearest multiple of hw_res->mon_scale so it matches a
+ * value the hardware will measure. mon_scale may not be a power of 2.
+ */
+ threshold /= hw_res->mon_scale;
+ resctrl_rmid_realloc_threshold = threshold * hw_res->mon_scale;

ret = dom_data_init(r);
if (ret)
---------------%<---------------

2022-04-04 03:46:45

by Reinette Chatre

[permalink] [raw]
Subject: Re: [PATCH v3 19/21] x86/resctrl: Rename and change the units of resctrl_cqm_threshold

Hi James,

On 3/30/2022 9:45 AM, James Morse wrote:
> Hi Reinette,
>
> On 17/03/2022 17:00, Reinette Chatre wrote:
>> On 2/17/2022 10:21 AM, James Morse wrote:
>>> resctrl_cqm_threshold is stored in a hardware specific chunk size,
>>> but exposed to user-space as bytes.
>>>
>>> This means the filesystem parts of resctrl need to know how the hardware
>>> counts, to convert the user provided byte value to chunks. The interface
>>> between the architecture's resctrl code and the filesystem ought to
>>> treat everything as bytes.
>>>
>>> Change the unit of resctrl_cqm_threshold to bytes. resctrl_arch_rmid_read()
>>> still returns its value in chunks, so this needs converting to bytes.
>>> As all the callers have been touched, rename the variable to
>>> resctrl_rmid_realloc_threshold, which describes what the value is for.
>
>>> @@ -762,10 +763,7 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
>>> *
>>> * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
>>> */
>>> - resctrl_cqm_threshold = cl_size * 1024 / r->num_rmid;
>>> -
>>> - /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
>>> - resctrl_cqm_threshold /= hw_res->mon_scale;
>>> + resctrl_rmid_realloc_threshold = cl_size * 1024 / r->num_rmid;
>>>
>>> ret = dom_data_init(r);
>>> if (ret)
>>> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>>> index 7ec089d72ab7..93b3697027df 100644
>>> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>>> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>>> @@ -1030,10 +1030,7 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of,
>>> static int max_threshold_occ_show(struct kernfs_open_file *of,
>>> struct seq_file *seq, void *v)
>>> {
>>> - struct rdt_resource *r = of->kn->parent->priv;
>>> - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
>>> -
>>> - seq_printf(seq, "%u\n", resctrl_cqm_threshold * hw_res->mon_scale);
>>> + seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold);
>>>
>>> return 0;
>>> }
>>
>>
>> This change has some user visible impact that I am still digesting but thought
>> that I would share for your consideration.
>>
>> As seen in the above two snippets, the original code did:
>>
>> resctrl_cqm_threshold /= hw_res->mon_scale; /* resctrl_cqm_threshold used internally */
>>
>> resctrl_cqm_threshold * hw_res->mon_scale; /* this is displayed to user */
>>
>> The original loss due to truncation during the division is not recovered
>> when the value is displayed to the user the user may see significant differences
>> before and after this patch.
>>
>> I tried this out on a system with a large cache and the before and after
>> information is significant:
>> Before this patch:
>> info/L3_MON/max_threshold_occupancy:147456
>>
>> After this patch:
>> info/L3_MON/max_threshold_occupancy:196608
>
> Hmm. I hadn't considered that information would be lost by the current way of doing this.
> It looks like this happens because num_rmid isn't necessarily a power of 2.
>
>
>> As I understand this change indeed represents the information more accurately but
>> I found it noteworthy that this is not just a simple "change the units" and
>> may thus have broader impact and may indeed result in different behavior that
>> should be considered.
>
> I agree it more accurately reflects resctrl's calculation of "the number
> of lines tagged per RMID if all RMIDs have the same number of lines", but if that
> produces a number the hardware will never actually measure, then the rounding is still
> happening, but somewhere else.
>
> I think the right thing to do is round resctrl_rmid_realloc_threshold down to the nearest
> multiple of hw_res->mon_scale in rdt_get_mon_l3_config(). This way the filesystem parts
> still handle things in bytes, and the architecture code provides the quantised value that
> will actually get measured. Its this value that should be reported to user-space.
>
> It doesn't look like the 'Upscaling Factor' is guaranteed to be a power of 2, so I can't
> use the round_down() helpers.
>
> I've added this to the commit message:
> | Neither r->num_rmid nor hw_res->mon_scale are guaranteed to be a power
> | of 2, so the existing code introduces a rounding error from resctrl's
> | theoretical fraction of the cache usage. This behaviour is kept as it
> | ensures the user visible value matches the value read from hardware
> | when the rmid will be reallocated.
>
> and the hunk below, which fixes it for me.
>
>
>
> Thanks,
>
> James
>
> ---------------%<---------------
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
> index b18e227d585c..fb81d650c457 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -753,6 +753,7 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
> unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
> struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
> unsigned int cl_size = boot_cpu_data.x86_cache_size;
> + u64 threshold;
> int ret;
>
> hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale;
> @@ -771,7 +772,15 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
> *
> * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
> */
> - resctrl_rmid_realloc_threshold = cl_size * 1024 / r->num_rmid;
> + threshold = cl_size * 1024 / r->num_rmid;
> +
> + /*
> + * Because num_rmid may not be a power of two, round the value
> + * to the nearest multiple of hw_res->mon_scale so it matches a
> + * value the hardware will measure. mon_scale may not be a power of 2.
> + */
> + threshold /= hw_res->mon_scale;
> + resctrl_rmid_realloc_threshold = threshold * hw_res->mon_scale;
>
> ret = dom_data_init(r);
> if (ret)
> ---------------%<---------------

Thank you for the added explanation. From what I can tell this also restores current
behavior.

Reinette