The scheduler reads the IPCC scores when balancing load. These reads can
occur frequently and originate from many CPUs. Hardware may also
occasionally update the HFI table. Controlling access with locks would
cause contention.
Cache the IPCC scores in separate per-CPU variables that the scheduler can
use. Use a seqcount to synchronize memory accesses to these cached values.
This eliminates the need for locks, as the sequence counter provides the
memory ordering required to prevent the use of stale data.
The HFI delayed workqueue guarantees that only one CPU writes the cached
IPCC scores. The frequency of updates is low (every CONFIG_HZ jiffies or
less), and the number of writes per update is in the order of tens. Writes
should not starve reads.
Only cache IPCC scores in this changeset. A subsequent changeset will
use these scores.
Cc: Ben Segall <[email protected]>
Cc: Daniel Bristot de Oliveira <[email protected]>
Cc: Dietmar Eggemann <[email protected]>
Cc: Ionela Voinescu <[email protected]>
Cc: Joel Fernandes (Google) <[email protected]>
Cc: Len Brown <[email protected]>
Cc: Lukasz Luba <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Perry Yuan <[email protected]>
Cc: Rafael J. Wysocki <[email protected]>
Cc: Srinivas Pandruvada <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Tim C. Chen <[email protected]>
Cc: Valentin Schneider <[email protected]>
Cc: Zhao Liu <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Suggested-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Ricardo Neri <[email protected]>
---
Changes since v3:
* As Rafael requested, I reworked the memory ordering of the cached IPCC
scores. I selected a seqcount as is less expensive than a memory
barrier, which is not necessary anyways.
* Made alloc_hfi_ipcc_scores() return -ENOMEM on allocation failure.
(Rafael)
* Added a comment to describe hfi_ipcc_scores. (Rafael)
Changes since v2:
* Only create these per-CPU variables when Intel Thread Director is
supported.
Changes since v1:
* Added this patch.
---
drivers/thermal/intel/intel_hfi.c | 66 +++++++++++++++++++++++++++++++
1 file changed, 66 insertions(+)
diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
index 20ee4264dcd4..d822ed0bb5c1 100644
--- a/drivers/thermal/intel/intel_hfi.c
+++ b/drivers/thermal/intel/intel_hfi.c
@@ -29,9 +29,11 @@
#include <linux/kernel.h>
#include <linux/math.h>
#include <linux/mutex.h>
+#include <linux/percpu.h>
#include <linux/percpu-defs.h>
#include <linux/printk.h>
#include <linux/processor.h>
+#include <linux/seqlock.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/string.h>
@@ -180,6 +182,62 @@ static struct workqueue_struct *hfi_updates_wq;
#define HFI_UPDATE_INTERVAL HZ
#define HFI_MAX_THERM_NOTIFY_COUNT 16
+/* A cache of the HFI perf capabilities for lockless access. */
+static int __percpu *hfi_ipcc_scores;
+/* Sequence counter for hfi_ipcc_scores */
+static seqcount_t hfi_ipcc_seqcount = SEQCNT_ZERO(hfi_ipcc_seqcount);
+
+static int alloc_hfi_ipcc_scores(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_ITD))
+ return 0;
+
+ hfi_ipcc_scores = __alloc_percpu(sizeof(*hfi_ipcc_scores) *
+ hfi_features.nr_classes,
+ sizeof(*hfi_ipcc_scores));
+
+ return hfi_ipcc_scores ? 0 : -ENOMEM;
+}
+
+static void set_hfi_ipcc_scores(struct hfi_instance *hfi_instance)
+{
+ int cpu;
+
+ if (!cpu_feature_enabled(X86_FEATURE_ITD))
+ return;
+
+ /*
+ * Serialize with writes to the HFI table. It also protects the write
+ * loop against seqcount readers running in interrupt context.
+ */
+ raw_spin_lock_irq(&hfi_instance->table_lock);
+ /*
+ * The seqcount implies store-release semantics to order stores with
+ * lockless loads from the seqcount read side. It also implies a
+ * compiler barrier.
+ */
+ write_seqcount_begin(&hfi_ipcc_seqcount);
+ for_each_cpu(cpu, hfi_instance->cpus) {
+ int c, *scores;
+ s16 index;
+
+ index = per_cpu(hfi_cpu_info, cpu).index;
+ scores = per_cpu_ptr(hfi_ipcc_scores, cpu);
+
+ for (c = 0; c < hfi_features.nr_classes; c++) {
+ struct hfi_cpu_data *caps;
+
+ caps = hfi_instance->data +
+ index * hfi_features.cpu_stride +
+ c * hfi_features.class_stride;
+ scores[c] = caps->perf_cap;
+ }
+ }
+
+ write_seqcount_end(&hfi_ipcc_seqcount);
+ raw_spin_unlock_irq(&hfi_instance->table_lock);
+}
+
/**
* intel_hfi_read_classid() - Read the currrent classid
* @classid: Variable to which the classid will be written.
@@ -275,6 +333,8 @@ static void update_capabilities(struct hfi_instance *hfi_instance)
thermal_genl_cpu_capability_event(cpu_count, &cpu_caps[i]);
kfree(cpu_caps);
+
+ set_hfi_ipcc_scores(hfi_instance);
out:
mutex_unlock(&hfi_instance_lock);
}
@@ -618,8 +678,14 @@ void __init intel_hfi_init(void)
if (!hfi_updates_wq)
goto err_nomem;
+ if (alloc_hfi_ipcc_scores())
+ goto err_ipcc;
+
return;
+err_ipcc:
+ destroy_workqueue(hfi_updates_wq);
+
err_nomem:
for (j = 0; j < i; ++j) {
hfi_instance = &hfi_instances[j];
--
2.25.1
On Tue, Jun 13, 2023 at 6:23 AM Ricardo Neri
<[email protected]> wrote:
>
> The scheduler reads the IPCC scores when balancing load. These reads can
> occur frequently and originate from many CPUs. Hardware may also
> occasionally update the HFI table. Controlling access with locks would
> cause contention.
>
> Cache the IPCC scores in separate per-CPU variables that the scheduler can
> use. Use a seqcount to synchronize memory accesses to these cached values.
> This eliminates the need for locks, as the sequence counter provides the
> memory ordering required to prevent the use of stale data.
>
> The HFI delayed workqueue guarantees that only one CPU writes the cached
> IPCC scores. The frequency of updates is low (every CONFIG_HZ jiffies or
> less), and the number of writes per update is in the order of tens. Writes
> should not starve reads.
>
> Only cache IPCC scores in this changeset. A subsequent changeset will
> use these scores.
>
> Cc: Ben Segall <[email protected]>
> Cc: Daniel Bristot de Oliveira <[email protected]>
> Cc: Dietmar Eggemann <[email protected]>
> Cc: Ionela Voinescu <[email protected]>
> Cc: Joel Fernandes (Google) <[email protected]>
> Cc: Len Brown <[email protected]>
> Cc: Lukasz Luba <[email protected]>
> Cc: Mel Gorman <[email protected]>
> Cc: Perry Yuan <[email protected]>
> Cc: Rafael J. Wysocki <[email protected]>
> Cc: Srinivas Pandruvada <[email protected]>
> Cc: Steven Rostedt <[email protected]>
> Cc: Tim C. Chen <[email protected]>
> Cc: Valentin Schneider <[email protected]>
> Cc: Zhao Liu <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Suggested-by: Peter Zijlstra (Intel) <[email protected]>
> Signed-off-by: Ricardo Neri <[email protected]>
> ---
> Changes since v3:
> * As Rafael requested, I reworked the memory ordering of the cached IPCC
> scores. I selected a seqcount as is less expensive than a memory
> barrier, which is not necessary anyways.
> * Made alloc_hfi_ipcc_scores() return -ENOMEM on allocation failure.
> (Rafael)
> * Added a comment to describe hfi_ipcc_scores. (Rafael)
>
> Changes since v2:
> * Only create these per-CPU variables when Intel Thread Director is
> supported.
>
> Changes since v1:
> * Added this patch.
> ---
> drivers/thermal/intel/intel_hfi.c | 66 +++++++++++++++++++++++++++++++
> 1 file changed, 66 insertions(+)
>
> diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
> index 20ee4264dcd4..d822ed0bb5c1 100644
> --- a/drivers/thermal/intel/intel_hfi.c
> +++ b/drivers/thermal/intel/intel_hfi.c
> @@ -29,9 +29,11 @@
> #include <linux/kernel.h>
> #include <linux/math.h>
> #include <linux/mutex.h>
> +#include <linux/percpu.h>
> #include <linux/percpu-defs.h>
> #include <linux/printk.h>
> #include <linux/processor.h>
> +#include <linux/seqlock.h>
> #include <linux/slab.h>
> #include <linux/spinlock.h>
> #include <linux/string.h>
> @@ -180,6 +182,62 @@ static struct workqueue_struct *hfi_updates_wq;
> #define HFI_UPDATE_INTERVAL HZ
> #define HFI_MAX_THERM_NOTIFY_COUNT 16
>
> +/* A cache of the HFI perf capabilities for lockless access. */
> +static int __percpu *hfi_ipcc_scores;
> +/* Sequence counter for hfi_ipcc_scores */
> +static seqcount_t hfi_ipcc_seqcount = SEQCNT_ZERO(hfi_ipcc_seqcount);
> +
> +static int alloc_hfi_ipcc_scores(void)
> +{
> + if (!cpu_feature_enabled(X86_FEATURE_ITD))
> + return 0;
> +
> + hfi_ipcc_scores = __alloc_percpu(sizeof(*hfi_ipcc_scores) *
> + hfi_features.nr_classes,
> + sizeof(*hfi_ipcc_scores));
> +
> + return hfi_ipcc_scores ? 0 : -ENOMEM;
> +}
This doesn't need to return an int. It could be a bool function
returning !!hfi_ipcc_scores (or false for the feature missing case).
Apart from this minor thing, the patch looks reasonable to me.
> +
> +static void set_hfi_ipcc_scores(struct hfi_instance *hfi_instance)
> +{
> + int cpu;
> +
> + if (!cpu_feature_enabled(X86_FEATURE_ITD))
> + return;
> +
> + /*
> + * Serialize with writes to the HFI table. It also protects the write
> + * loop against seqcount readers running in interrupt context.
> + */
> + raw_spin_lock_irq(&hfi_instance->table_lock);
> + /*
> + * The seqcount implies store-release semantics to order stores with
> + * lockless loads from the seqcount read side. It also implies a
> + * compiler barrier.
> + */
> + write_seqcount_begin(&hfi_ipcc_seqcount);
> + for_each_cpu(cpu, hfi_instance->cpus) {
> + int c, *scores;
> + s16 index;
> +
> + index = per_cpu(hfi_cpu_info, cpu).index;
> + scores = per_cpu_ptr(hfi_ipcc_scores, cpu);
> +
> + for (c = 0; c < hfi_features.nr_classes; c++) {
> + struct hfi_cpu_data *caps;
> +
> + caps = hfi_instance->data +
> + index * hfi_features.cpu_stride +
> + c * hfi_features.class_stride;
> + scores[c] = caps->perf_cap;
> + }
> + }
> +
> + write_seqcount_end(&hfi_ipcc_seqcount);
> + raw_spin_unlock_irq(&hfi_instance->table_lock);
> +}
> +
> /**
> * intel_hfi_read_classid() - Read the currrent classid
> * @classid: Variable to which the classid will be written.
> @@ -275,6 +333,8 @@ static void update_capabilities(struct hfi_instance *hfi_instance)
> thermal_genl_cpu_capability_event(cpu_count, &cpu_caps[i]);
>
> kfree(cpu_caps);
> +
> + set_hfi_ipcc_scores(hfi_instance);
> out:
> mutex_unlock(&hfi_instance_lock);
> }
> @@ -618,8 +678,14 @@ void __init intel_hfi_init(void)
> if (!hfi_updates_wq)
> goto err_nomem;
>
> + if (alloc_hfi_ipcc_scores())
> + goto err_ipcc;
> +
> return;
>
> +err_ipcc:
> + destroy_workqueue(hfi_updates_wq);
> +
> err_nomem:
> for (j = 0; j < i; ++j) {
> hfi_instance = &hfi_instances[j];
> --
> 2.25.1
>
On Thu, Jun 29, 2023 at 08:53:31PM +0200, Rafael J. Wysocki wrote:
> On Tue, Jun 13, 2023 at 6:23 AM Ricardo Neri
> <[email protected]> wrote:
> >
> > The scheduler reads the IPCC scores when balancing load. These reads can
> > occur frequently and originate from many CPUs. Hardware may also
> > occasionally update the HFI table. Controlling access with locks would
> > cause contention.
> >
> > Cache the IPCC scores in separate per-CPU variables that the scheduler can
> > use. Use a seqcount to synchronize memory accesses to these cached values.
> > This eliminates the need for locks, as the sequence counter provides the
> > memory ordering required to prevent the use of stale data.
> >
> > The HFI delayed workqueue guarantees that only one CPU writes the cached
> > IPCC scores. The frequency of updates is low (every CONFIG_HZ jiffies or
> > less), and the number of writes per update is in the order of tens. Writes
> > should not starve reads.
> >
> > Only cache IPCC scores in this changeset. A subsequent changeset will
> > use these scores.
> >
> > Cc: Ben Segall <[email protected]>
> > Cc: Daniel Bristot de Oliveira <[email protected]>
> > Cc: Dietmar Eggemann <[email protected]>
> > Cc: Ionela Voinescu <[email protected]>
> > Cc: Joel Fernandes (Google) <[email protected]>
> > Cc: Len Brown <[email protected]>
> > Cc: Lukasz Luba <[email protected]>
> > Cc: Mel Gorman <[email protected]>
> > Cc: Perry Yuan <[email protected]>
> > Cc: Rafael J. Wysocki <[email protected]>
> > Cc: Srinivas Pandruvada <[email protected]>
> > Cc: Steven Rostedt <[email protected]>
> > Cc: Tim C. Chen <[email protected]>
> > Cc: Valentin Schneider <[email protected]>
> > Cc: Zhao Liu <[email protected]>
> > Cc: [email protected]
> > Cc: [email protected]
> > Cc: [email protected]
> > Suggested-by: Peter Zijlstra (Intel) <[email protected]>
> > Signed-off-by: Ricardo Neri <[email protected]>
> > ---
> > Changes since v3:
> > * As Rafael requested, I reworked the memory ordering of the cached IPCC
> > scores. I selected a seqcount as is less expensive than a memory
> > barrier, which is not necessary anyways.
> > * Made alloc_hfi_ipcc_scores() return -ENOMEM on allocation failure.
> > (Rafael)
> > * Added a comment to describe hfi_ipcc_scores. (Rafael)
> >
> > Changes since v2:
> > * Only create these per-CPU variables when Intel Thread Director is
> > supported.
> >
> > Changes since v1:
> > * Added this patch.
> > ---
> > drivers/thermal/intel/intel_hfi.c | 66 +++++++++++++++++++++++++++++++
> > 1 file changed, 66 insertions(+)
> >
> > diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
> > index 20ee4264dcd4..d822ed0bb5c1 100644
> > --- a/drivers/thermal/intel/intel_hfi.c
> > +++ b/drivers/thermal/intel/intel_hfi.c
> > @@ -29,9 +29,11 @@
> > #include <linux/kernel.h>
> > #include <linux/math.h>
> > #include <linux/mutex.h>
> > +#include <linux/percpu.h>
> > #include <linux/percpu-defs.h>
> > #include <linux/printk.h>
> > #include <linux/processor.h>
> > +#include <linux/seqlock.h>
> > #include <linux/slab.h>
> > #include <linux/spinlock.h>
> > #include <linux/string.h>
> > @@ -180,6 +182,62 @@ static struct workqueue_struct *hfi_updates_wq;
> > #define HFI_UPDATE_INTERVAL HZ
> > #define HFI_MAX_THERM_NOTIFY_COUNT 16
> >
> > +/* A cache of the HFI perf capabilities for lockless access. */
> > +static int __percpu *hfi_ipcc_scores;
> > +/* Sequence counter for hfi_ipcc_scores */
> > +static seqcount_t hfi_ipcc_seqcount = SEQCNT_ZERO(hfi_ipcc_seqcount);
> > +
> > +static int alloc_hfi_ipcc_scores(void)
> > +{
> > + if (!cpu_feature_enabled(X86_FEATURE_ITD))
> > + return 0;
> > +
> > + hfi_ipcc_scores = __alloc_percpu(sizeof(*hfi_ipcc_scores) *
> > + hfi_features.nr_classes,
> > + sizeof(*hfi_ipcc_scores));
> > +
> > + return hfi_ipcc_scores ? 0 : -ENOMEM;
> > +}
>
> This doesn't need to return an int. It could be a bool function
> returning !!hfi_ipcc_scores (or false for the feature missing case).
Sure Rafael, I can make this change.
>
> Apart from this minor thing, the patch looks reasonable to me.
Thank you for your review!