From: Vincent Donnefort <[email protected]>
Introduce two macro helpers, u64_32read() and u64_32read_set_copy(), to
factorize the u64 min_vruntime and last_update_time reads on 32-bit
architectures. These new helpers encapsulate the smp_rmb() and smp_wmb()
synchronization and therefore introduce a small penalty in
set_task_rq_fair() and init_cfs_rq().
The choice of a macro over an inline function is driven by the conditional
declaration of the u64 copy variables:
#ifndef CONFIG_64BIT
u64 [vminruntime|last_update_time]_copy;
#endif
Signed-off-by: Vincent Donnefort <[email protected]>
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1a68a05..00a825d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -563,10 +563,8 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
/* ensure we never gain time by being placed backwards. */
cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
-#ifndef CONFIG_64BIT
- smp_wmb();
- cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
-#endif
+
+ u64_32read_set_copy(cfs_rq->min_vruntime, cfs_rq->min_vruntime_copy);
}
/*
@@ -3284,6 +3282,11 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
}
#ifdef CONFIG_SMP
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
+ return u64_32read(cfs_rq->avg.last_update_time,
+ cfs_rq->load_last_update_time_copy);
+}
#ifdef CONFIG_FAIR_GROUP_SCHED
/**
* update_tg_load_avg - update the tg's load avg
@@ -3340,27 +3343,9 @@ void set_task_rq_fair(struct sched_entity *se,
if (!(se->avg.last_update_time && prev))
return;
-#ifndef CONFIG_64BIT
- {
- u64 p_last_update_time_copy;
- u64 n_last_update_time_copy;
-
- do {
- p_last_update_time_copy = prev->load_last_update_time_copy;
- n_last_update_time_copy = next->load_last_update_time_copy;
-
- smp_rmb();
+ p_last_update_time = cfs_rq_last_update_time(prev);
+ n_last_update_time = cfs_rq_last_update_time(next);
- p_last_update_time = prev->avg.last_update_time;
- n_last_update_time = next->avg.last_update_time;
-
- } while (p_last_update_time != p_last_update_time_copy ||
- n_last_update_time != n_last_update_time_copy);
- }
-#else
- p_last_update_time = prev->avg.last_update_time;
- n_last_update_time = next->avg.last_update_time;
-#endif
__update_load_avg_blocked_se(p_last_update_time, se);
se->avg.last_update_time = n_last_update_time;
}
@@ -3681,10 +3666,8 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
-#ifndef CONFIG_64BIT
- smp_wmb();
- cfs_rq->load_last_update_time_copy = sa->last_update_time;
-#endif
+ u64_32read_set_copy(sa->last_update_time,
+ cfs_rq->load_last_update_time_copy);
return decayed;
}
@@ -3810,27 +3793,6 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
}
}
-#ifndef CONFIG_64BIT
-static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
-{
- u64 last_update_time_copy;
- u64 last_update_time;
-
- do {
- last_update_time_copy = cfs_rq->load_last_update_time_copy;
- smp_rmb();
- last_update_time = cfs_rq->avg.last_update_time;
- } while (last_update_time != last_update_time_copy);
-
- return last_update_time;
-}
-#else
-static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
-{
- return cfs_rq->avg.last_update_time;
-}
-#endif
-
/*
* Synchronize entity load avg of dequeued entity without locking
* the previous rq.
@@ -6744,21 +6706,9 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
if (p->state == TASK_WAKING) {
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 min_vruntime;
-
-#ifndef CONFIG_64BIT
- u64 min_vruntime_copy;
-
- do {
- min_vruntime_copy = cfs_rq->min_vruntime_copy;
- smp_rmb();
- min_vruntime = cfs_rq->min_vruntime;
- } while (min_vruntime != min_vruntime_copy);
-#else
- min_vruntime = cfs_rq->min_vruntime;
-#endif
- se->vruntime -= min_vruntime;
+ se->vruntime -= u64_32read(cfs_rq->min_vruntime,
+ cfs_rq->min_vruntime_copy);
}
if (p->on_rq == TASK_ON_RQ_MIGRATING) {
@@ -10891,9 +10841,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
-#ifndef CONFIG_64BIT
- cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
-#endif
+ u64_32read_set_copy(cfs_rq->min_vruntime, cfs_rq->min_vruntime_copy);
#ifdef CONFIG_SMP
raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9f33c77..c8c4646 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -605,6 +605,41 @@ struct cfs_rq {
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
+/*
+ * u64_32read() / u64_32read_set_copy()
+ *
+ * Use the copied u64 value to protect against data race. This is only
+ * applicable for 32-bits architectures.
+ */
+#if !defined(CONFIG_64BIT) && defined(CONFIG_SMP)
+# define u64_32read(val, copy) \
+({ \
+ u64 _val; \
+ u64 _val_copy; \
+ \
+ do { \
+ _val_copy = copy; \
+ /* \
+ * paired with u64_32read_set_copy, ordering access \
+ * to val and copy. \
+ */ \
+ smp_rmb(); \
+ _val = val; \
+ } while (_val != _val_copy); \
+ \
+ _val; \
+})
+# define u64_32read_set_copy(val, copy) \
+do { \
+ /* paired with u64_32read, ordering access to val and copy */ \
+ smp_wmb(); \
+ copy = val; \
+} while (0)
+#else
+# define u64_32read(val, copy) (val)
+# define u64_32read_set_copy(val, copy) do { } while (0)
+#endif
+
static inline int rt_bandwidth_enabled(void)
{
return sysctl_sched_rt_runtime >= 0;
--
2.7.4
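The helpers factor out a seqcount-like scheme: the writer updates the 64-bit
value, issues smp_wmb(), then updates the copy; the reader loads the copy,
issues smp_rmb(), loads the value, and retries until the two agree, so a torn
64-bit read on a 32-bit machine is simply retried. Below is a minimal,
self-contained userspace sketch of that protocol, assuming C11 relaxed atomics
plus explicit fences as stand-ins for the kernel's plain accesses and
smp_wmb()/smp_rmb(); it is an illustration only, not kernel code (and on a
64-bit host the accesses are atomic anyway, so nothing can actually tear).

/* Build with:  cc -O2 -pthread u64_copy_demo.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t val;            /* e.g. min_vruntime */
static _Atomic uint64_t val_copy;       /* e.g. min_vruntime_copy */

/*
 * Writer side: store the value, then publish the copy. In the patch the
 * value is stored by the caller and u64_32read_set_copy() only performs
 * the barrier plus the copy update.
 */
static void write_val(uint64_t v)
{
        atomic_store_explicit(&val, v, memory_order_relaxed);
        atomic_thread_fence(memory_order_release);      /* ~ smp_wmb() */
        atomic_store_explicit(&val_copy, v, memory_order_relaxed);
}

/* Reader side: same retry loop as u64_32read(). */
static uint64_t read_val(void)
{
        uint64_t v, c;

        do {
                c = atomic_load_explicit(&val_copy, memory_order_relaxed);
                atomic_thread_fence(memory_order_acquire);      /* ~ smp_rmb() */
                v = atomic_load_explicit(&val, memory_order_relaxed);
        } while (v != c);

        return v;
}

static void *writer(void *arg)
{
        (void)arg;
        for (uint64_t i = 1; i <= 1000000; i++)
                write_val(i);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, writer, NULL);
        for (int i = 0; i < 1000000; i++)
                (void)read_val();
        pthread_join(t, NULL);
        printf("final value: %llu\n", (unsigned long long)read_val());
        return 0;
}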
* [email protected] <[email protected]> wrote:
> From: Vincent Donnefort <[email protected]>
>
> Introducing two macro helpers u64_32read() and u64_32read_set_copy() to
> factorize the u64 vminruntime and last_update_time read on a 32-bits
> architecture. Those new helpers encapsulate smp_rmb() and smp_wmb()
> synchronization and therefore, have a small penalty in set_task_rq_fair()
> and init_cfs_rq().
>
> The choice of using a macro over an inline function is driven by the
> conditional u64 variable copy declarations.
Could you please explain how "conditional u64 variable copy
declarations" prevents us to use an inline function for this:
> +/*
> + * u64_32read() / u64_32read_set_copy()
> + *
> + * Use the copied u64 value to protect against data race. This is only
> + * applicable for 32-bits architectures.
> + */
> +#if !defined(CONFIG_64BIT) && defined(CONFIG_SMP)
> +# define u64_32read(val, copy) \
> +({ \
> + u64 _val; \
> + u64 _val_copy; \
> + \
> + do { \
> + _val_copy = copy; \
> + /* \
> + * paired with u64_32read_set_copy, ordering access \
> + * to val and copy. \
> + */ \
> + smp_rmb(); \
> + _val = val; \
> + } while (_val != _val_copy); \
> + \
> + _val; \
> +})
> +# define u64_32read_set_copy(val, copy) \
> +do { \
> + /* paired with u64_32read, ordering access to val and copy */ \
> + smp_wmb(); \
> + copy = val; \
> +} while (0)
> +#else
> +# define u64_32read(val, copy) (val)
> +# define u64_32read_set_copy(val, copy) do { } while (0)
> +#endif
> +
Thanks,
Ingo
Hi,
On Mon, Jul 27, 2020 at 01:24:54PM +0200, Ingo Molnar wrote:
>
> * [email protected] <[email protected]> wrote:
>
> > From: Vincent Donnefort <[email protected]>
> >
> > Introducing two macro helpers u64_32read() and u64_32read_set_copy() to
> > factorize the u64 vminruntime and last_update_time read on a 32-bits
> > architecture. Those new helpers encapsulate smp_rmb() and smp_wmb()
> > synchronization and therefore, have a small penalty in set_task_rq_fair()
> > and init_cfs_rq().
> >
> > The choice of using a macro over an inline function is driven by the
> > conditional u64 variable copy declarations.
>
> Could you please explain how "conditional u64 variable copy
> declarations" prevents us to use an inline function for this
I meant that, since the "copy" variable ([min_vruntime|last_update_time]_copy)
is declared only in the !CONFIG_64BIT case, a function call would fail to
compile when CONFIG_64BIT is set.
u64_32read(cfs_rq->min_vruntime, cfs_rq->min_vruntime_copy);
^
not declared
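To make the constraint concrete, here is a hedged sketch (not part of the
patch; 'struct foo' and a plain -DCONFIG_64BIT define stand in for the real
structures and Kconfig) of why the inline-function variant breaks the 64-bit
build while the macro does not:

#include <stdint.h>

struct foo {
        uint64_t min_vruntime;
#ifndef CONFIG_64BIT
        uint64_t min_vruntime_copy;     /* the copy only exists on 32-bit builds */
#endif
};

/* Inline variant: both arguments must name declared objects. */
static inline uint64_t u64_32read_fn(uint64_t val, uint64_t copy)
{
        (void)copy;     /* 32-bit retry loop elided in this sketch */
        return val;
}

uint64_t read_min_vruntime(struct foo *f)
{
        /*
         * Built with -DCONFIG_64BIT, this call does not compile:
         * 'struct foo' has no member named 'min_vruntime_copy'.
         *
         * The macro form, "#define u64_32read(val, copy) (val)",
         * never expands its unused second argument, so the same
         * call site builds on both 32-bit and 64-bit.
         */
        return u64_32read_fn(f->min_vruntime, f->min_vruntime_copy);
}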
Should I rephrase this part to make it more understandable?
>
> > +/*
> > + * u64_32read() / u64_32read_set_copy()
> > + *
> > + * Use the copied u64 value to protect against data race. This is only
> > + * applicable for 32-bits architectures.
> > + */
> > +#if !defined(CONFIG_64BIT) && defined(CONFIG_SMP)
> > +# define u64_32read(val, copy) \
> > +({ \
> > + u64 _val; \
> > + u64 _val_copy; \
> > + \
> > + do { \
> > + _val_copy = copy; \
> > + /* \
> > + * paired with u64_32read_set_copy, ordering access \
> > + * to val and copy. \
> > + */ \
> > + smp_rmb(); \
> > + _val = val; \
> > + } while (_val != _val_copy); \
> > + \
> > + _val; \
> > +})
> > +# define u64_32read_set_copy(val, copy) \
> > +do { \
> > + /* paired with u64_32read, ordering access to val and copy */ \
> > + smp_wmb(); \
> > + copy = val; \
> > +} while (0)
> > +#else
> > +# define u64_32read(val, copy) (val)
> > +# define u64_32read_set_copy(val, copy) do { } while (0)
> > +#endif
> > +
>
> Thanks,
>
> Ingo
--
Vincent
On Mon, Jul 27, 2020 at 11:59:24AM +0100, [email protected] wrote:
> From: Vincent Donnefort <[email protected]>
>
> Introducing two macro helpers u64_32read() and u64_32read_set_copy() to
> factorize the u64 vminruntime and last_update_time read on a 32-bits
> architecture. Those new helpers encapsulate smp_rmb() and smp_wmb()
> synchronization and therefore, have a small penalty in set_task_rq_fair()
> and init_cfs_rq().
>
> The choice of using a macro over an inline function is driven by the
> conditional u64 variable copy declarations.
>
> #ifndef CONFIG_64BIT
> u64 [vminruntime|last_update_time]_copy;
> #endif
This lacks a *why*... why did you get up this morning and write us this
patch.
On Mon, Jul 27, 2020 at 02:38:01PM +0200, [email protected] wrote:
> On Mon, Jul 27, 2020 at 11:59:24AM +0100, [email protected] wrote:
> > From: Vincent Donnefort <[email protected]>
> >
> > Introducing two macro helpers u64_32read() and u64_32read_set_copy() to
> > factorize the u64 vminruntime and last_update_time read on a 32-bits
> > architecture. Those new helpers encapsulate smp_rmb() and smp_wmb()
> > synchronization and therefore, have a small penalty in set_task_rq_fair()
> > and init_cfs_rq().
> >
> > The choice of using a macro over an inline function is driven by the
> > conditional u64 variable copy declarations.
> >
> > #ifndef CONFIG_64BIT
> > u64 [vminruntime|last_update_time]_copy;
> > #endif
>
> This lacks a *why*... why did you get up this morning and wrote us this
> patch.
>
>
On 32-bit architectures, both min_vruntime and last_update_time use a similar
access pattern. This patch is simply an attempt to unify their usage by
introducing two macros to rely on when accessing them. At the same time, it
adds a comment documenting the barrier usage, as per kernel policy. So
overall this is just a clean-up without any functional change.
--
Vincent.
Hi Vincent,
On 7/27/20 11:59 AM, [email protected] wrote:
> From: Vincent Donnefort <[email protected]>
>
> Introducing two macro helpers u64_32read() and u64_32read_set_copy() to
> factorize the u64 vminruntime and last_update_time read on a 32-bits
> architecture. Those new helpers encapsulate smp_rmb() and smp_wmb()
> synchronization and therefore, have a small penalty in set_task_rq_fair()
> and init_cfs_rq().
>
> The choice of using a macro over an inline function is driven by the
> conditional u64 variable copy declarations.
>
> #ifndef CONFIG_64BIT
> u64 [vminruntime|last_update_time]_copy;
> #endif
>
> Signed-off-by: Vincent Donnefort <[email protected]>
>
I've run it on a 32-bit ARM big.LITTLE platform (Odroid XU3) with
CONFIG_FAIR_GROUP_SCHED and CONFIG_CGROUP_SCHED enabled.
I haven't observed any issues while running hackbench or booting the
kernel, and the performance of the affected function set_task_rq_fair()
is the same.
Function profile results after the hackbench test:
--------w/ Vincent's patch + performance cpufreq gov------------
root@odroid:/sys/kernel/debug/tracing# cat trace_stat/function?
  Function                 Hit    Time            Avg             s^2
  --------                 ---    ----            ---             ---
  set_task_rq_fair        4068    3753.185 us     0.922 us        1.239 us
  set_task_rq_fair        4492    4180.133 us     0.930 us        2.318 us
  set_task_rq_fair        4208    3991.386 us     0.948 us       13.552 us
  set_task_rq_fair        4753    4432.231 us     0.932 us        5.875 us
  set_task_rq_fair        7980    5037.096 us     0.631 us        1.690 us
  set_task_rq_fair        8143    5078.515 us     0.623 us        2.930 us
  set_task_rq_fair        9721    6477.904 us     0.666 us        2.425 us
  set_task_rq_fair        7743    4896.002 us     0.632 us        1.188 us
-----------w/o Vincent patch, performance cpufreq gov------------
root@odroid:/sys/kernel/debug/tracing# cat trace_stat/function?
  Function                 Hit    Time            Avg             s^2
  --------                 ---    ----            ---             ---
  set_task_rq_fair        6525    5830.450 us     0.893 us        3.213 us
  set_task_rq_fair        6646    6069.444 us     0.913 us        9.651 us
  set_task_rq_fair        5988    5646.133 us     0.942 us        7.685 us
  set_task_rq_fair        5883    5390.103 us     0.916 us       29.283 us
  set_task_rq_fair        3844    2561.186 us     0.666 us        0.933 us
  set_task_rq_fair        5515    3491.011 us     0.633 us        9.845 us
  set_task_rq_fair        6947    4808.522 us     0.692 us        5.822 us
  set_task_rq_fair        4530    2810.000 us     0.620 us        0.554 us
Tested-by: Lukasz Luba <[email protected]>
Regards,
Lukasz
On Mon, Jul 27, 2020 at 04:23:03PM +0100, Vincent Donnefort wrote:
> For 32-bit architectures, both min_vruntime and last_update_time are using
> similar access. This patch is simply an attempt to unify their usage by
> introducing two macros to rely on when accessing those. At the same time, it
> brings a comment regarding the barriers usage, as per the kernel policy. So
> overall this is just a clean-up without any functional changes.
Ah, I thought there was perhaps the idea to make use of armv7-lpae
instructions.
Aside from that, I think we need to spend a little time bike-shedding the
API/naming here:
> +# define u64_32read(val, copy) (val)
> +# define u64_32read_set_copy(val, copy) do { } while (0)
How about something like:
#ifdef CONFIG_64BIT
#define DEFINE_U64_U32(name) u64 name
#define u64_u32_load(name) name
#define u64_u32_store(name, val)name = val
#else
#define DEFINE_U64_U32(name) \
struct { \
u64 name; \
u64 name##_copy; \
}
#define u64_u32_load(name) \
({ \
u64 val, copy; \
do { \
val = name; \
smb_rmb(); \
copy = name##_copy; \
} while (val != copy); \
val;
})
#define u64_u32_store(name, val) \
do { \
typeof(val) __val = (val); \
name = __val; \
smp_wmb(); \
name##_copy = __val; \
} while (0)
#endif
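For illustration, here is a hedged, standalone sketch of how call sites would
read under this proposed naming. Only the trivial 64-bit expansions are
spelled out (the 32-bit ones would carry the _copy field and the
barrier-paired retry loop), struct cfs_rq is reduced to the single field of
interest, and the extra parentheses are mine:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

/* 64-bit expansions only, following the proposal above. */
#define DEFINE_U64_U32(name)            u64 name
#define u64_u32_load(name)              (name)
#define u64_u32_store(name, val)        ((name) = (val))

/* Reduced stand-in for struct cfs_rq. */
struct cfs_rq {
        DEFINE_U64_U32(min_vruntime);   /* replaces min_vruntime + min_vruntime_copy */
};

int main(void)
{
        struct cfs_rq cfs_rq = { .min_vruntime = 0 };

        /* writer side, as in update_min_vruntime() */
        u64_u32_store(cfs_rq.min_vruntime, 1024);

        /* reader side, as in migrate_task_rq_fair() */
        printf("min_vruntime = %llu\n",
               (unsigned long long)u64_u32_load(cfs_rq.min_vruntime));

        return 0;
}

A nice property of the 32-bit DEFINE_U64_U32() as proposed is that it wraps
the value and its _copy in an anonymous struct, so the cfs_rq.min_vruntime
spelling at the call sites stays unchanged on both configurations.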
On Tue, Jul 28, 2020 at 01:13:02PM +0200, [email protected] wrote:
> On Mon, Jul 27, 2020 at 04:23:03PM +0100, Vincent Donnefort wrote:
>
> > For 32-bit architectures, both min_vruntime and last_update_time are using
> > similar access. This patch is simply an attempt to unify their usage by
> > introducing two macros to rely on when accessing those. At the same time, it
> > brings a comment regarding the barriers usage, as per the kernel policy. So
> > overall this is just a clean-up without any functional changes.
>
> Ah, I though there was perhaps the idea to make use of armv7-lpae
> instructions.
>
> Aside of that, I think we need to spend a little time bike-shedding the
> API/naming here:
>
> > +# define u64_32read(val, copy) (val)
> > +# define u64_32read_set_copy(val, copy) do { } while (0)
>
> How about something like:
>
> #ifdef CONFIG_64BIT
>
> #define DEFINE_U64_U32(name) u64 name
> #define u64_u32_load(name) name
> #define u64_u32_store(name, val)name = val
>
> #else
>
> #define DEFINE_U64_U32(name) \
> struct { \
> u64 name; \
> u64 name##_copy; \
> }
>
> #define u64_u32_load(name) \
> ({ \
> u64 val, copy; \
> do { \
> val = name; \
> smb_rmb(); \
> copy = name##_copy; \
> } while (val != copy); \
wrong order there; we should first read _copy and then the regular one
of course.
> val;
> })
>
> #define u64_u32_store(name, val) \
> do { \
> typeof(val) __val = (val); \
> name = __val; \
> smp_wmb(); \
> name##_copy = __val; \
> } while (0)
>
> #endif
The other approach is making it a full type with inline functions, I
suppose.
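For what it's worth, a hedged userspace sketch of what that full-type-plus-
inline-helpers shape might look like; the names, the struct layout, and the
C11 fences standing in for smp_wmb()/smp_rmb() are all assumptions, in the
kernel the copy field and the barriers would only exist for !CONFIG_64BIT,
and the retry protocol is the same one the original macros encapsulate:

#include <stdatomic.h>
#include <stdint.h>

/* Hypothetical full type: the value and its copy behind one struct. */
struct u64_u32 {
        _Atomic uint64_t val;
        _Atomic uint64_t copy;
};

/* Writer: store the value, then publish the copy (cf. u64_32read_set_copy()). */
static inline void u64_u32_write(struct u64_u32 *p, uint64_t v)
{
        atomic_store_explicit(&p->val, v, memory_order_relaxed);
        atomic_thread_fence(memory_order_release);      /* ~ smp_wmb() */
        atomic_store_explicit(&p->copy, v, memory_order_relaxed);
}

/* Reader: retry until the value and its copy agree (cf. u64_32read()). */
static inline uint64_t u64_u32_read(struct u64_u32 *p)
{
        uint64_t v, c;

        do {
                c = atomic_load_explicit(&p->copy, memory_order_relaxed);
                atomic_thread_fence(memory_order_acquire);      /* ~ smp_rmb() */
                v = atomic_load_explicit(&p->val, memory_order_relaxed);
        } while (v != c);

        return v;
}

The trade-off Vincent raises below still applies: last_update_time sits in
struct sched_avg while its copy sits in struct cfs_rq, so a self-contained
type would mean moving the copy.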
Hi,
On Tue, Jul 28, 2020 at 02:00:27PM +0200, [email protected] wrote:
> On Tue, Jul 28, 2020 at 01:13:02PM +0200, [email protected] wrote:
> > On Mon, Jul 27, 2020 at 04:23:03PM +0100, Vincent Donnefort wrote:
> >
> > > For 32-bit architectures, both min_vruntime and last_update_time are using
> > > similar access. This patch is simply an attempt to unify their usage by
> > > introducing two macros to rely on when accessing those. At the same time, it
> > > brings a comment regarding the barriers usage, as per the kernel policy. So
> > > overall this is just a clean-up without any functional changes.
> >
> > Ah, I though there was perhaps the idea to make use of armv7-lpae
> > instructions.
> >
> > Aside of that, I think we need to spend a little time bike-shedding the
> > API/naming here:
> >
> > > +# define u64_32read(val, copy) (val)
> > > +# define u64_32read_set_copy(val, copy) do { } while (0)
> >
> > How about something like:
> >
> > #ifdef CONFIG_64BIT
> >
> > #define DEFINE_U64_U32(name) u64 name
> > #define u64_u32_load(name) name
> > #define u64_u32_store(name, val)name = val
> >
> > #else
> >
> > #define DEFINE_U64_U32(name) \
> > struct { \
> > u64 name; \
> > u64 name##_copy; \
> > }
> >
> > #define u64_u32_load(name) \
> > ({ \
> > u64 val, copy; \
> > do { \
> > val = name; \
> > smb_rmb(); \
> > copy = name##_copy; \
> > } while (val != copy); \
>
> wrong order there; we should first read _copy and then the regular one
> of course.
>
> > val;
> > })
> >
> > #define u64_u32_store(name, val) \
> > do { \
> > typeof(val) __val = (val); \
> > name = __val; \
> > smp_wmb(); \
> > name##_copy = __val; \
> > } while (0)
> >
> > #endif
>
> The other approach is making it a full type and inline functions I
> suppose.
I didn't pick this approach originally, as I thought it would be way too
invasive. While the API looks much cleaner, it nonetheless doesn't match the
last_update_time usage: the variable is declared in struct sched_avg while
its copy is in struct cfs_rq.
Moving the copy into sched_avg would mean:
* Setting the copy for all struct sched_avg in ___update_load_sum(), instead
of just the cfs_rq.avg in update_cfs_rq_load_avg().
* Having the DEFINE_U64_U32 declaration in include/linux/sched.h to cover
struct sched_avg.
--
Vincent
Hi Peter,
[...]
> > >
> > > How about something like:
> > >
> > > #ifdef CONFIG_64BIT
> > >
> > > #define DEFINE_U64_U32(name) u64 name
> > > #define u64_u32_load(name) name
> > > #define u64_u32_store(name, val)name = val
> > >
> > > #else
> > >
> > > #define DEFINE_U64_U32(name) \
> > > struct { \
> > > u64 name; \
> > > u64 name##_copy; \
> > > }
> > >
> > > #define u64_u32_load(name) \
> > > ({ \
> > > u64 val, copy; \
> > > do { \
> > > val = name; \
> > > smb_rmb(); \
> > > copy = name##_copy; \
> > > } while (val != copy); \
> >
> > wrong order there; we should first read _copy and then the regular one
> > of course.
> >
> > > val;
> > > })
> > >
> > > #define u64_u32_store(name, val) \
> > > do { \
> > > typeof(val) __val = (val); \
> > > name = __val; \
> > > smp_wmb(); \
> > > name##_copy = __val; \
> > > } while (0)
> > >
> > > #endif
> >
> > The other approach is making it a full type and inline functions I
> > suppose.
>
> I didn't pick this approach originally, as I thought it would be way too
> invasive. If the API looks way cleaner, it nonetheless doesn't match
> last_update_time usage. The variable is declared in sched_avg while its copy
> is in struct cfs_rq.
>
> Moving the copy in sched_avg would mean:
>
> * Setting the copy for all struct sched_avg in ___update_load_sum(), instead
> of just the cfs_rq.avg in update_cfs_rq_load_avg().
>
> * Having the DEFINE_U64_U32 declaration in include/linux/sched.h to cover
> struct sched_avg.
Gentle ping