C0.2 state in umwait and tpause instructions can be enabled or disabled
on a processor through IA32_UMWAIT_CONTROL MSR register.
By default, C0.2 is enabled and the user wait instructions result in
lower power consumption with slower wakeup time.
But in real time systems which require faster wakeup time although power
savings could be smaller, the administrator needs to disable C0.2 and all
C0.2 requests from user applications revert to C0.1.
A sysfs interface "/sys/devices/system/cpu/umwait_control/enable_c02" is
created to allow the administrator to control C0.2 state during run time.
Signed-off-by: Fenghua Yu <[email protected]>
Reviewed-by: Ashok Raj <[email protected]>
Reviewed-by: Tony Luck <[email protected]>
---
arch/x86/power/umwait.c | 89 ++++++++++++++++++++++++++++++++++++++++-
1 file changed, 88 insertions(+), 1 deletion(-)
diff --git a/arch/x86/power/umwait.c b/arch/x86/power/umwait.c
index 23151e77c138..9c176f3e59b6 100644
--- a/arch/x86/power/umwait.c
+++ b/arch/x86/power/umwait.c
@@ -11,10 +11,18 @@
*/
static u32 umwait_control_cached = 100000;
+/*
+ * Serialize access to umwait_control_cached and IA32_UMWAIT_CONTROL MSR
+ * to guarantee all CPUs have the same MSR value.
+ */
+static DEFINE_MUTEX(umwait_lock);
+
/* Set up IA32_UMWAIT_CONTROL MSR on CPU using the current global setting. */
static int umwait_cpu_online(unsigned int cpu)
{
+ mutex_lock(&umwait_lock);
wrmsr(MSR_IA32_UMWAIT_CONTROL, umwait_control_cached, 0);
+ mutex_unlock(&umwait_lock);
return 0;
}
@@ -30,6 +38,7 @@ static int umwait_cpu_online(unsigned int cpu)
*/
static void umwait_syscore_resume(void)
{
+ /* No need to lock because only BP is running now. */
wrmsr(MSR_IA32_UMWAIT_CONTROL, umwait_control_cached, 0);
}
@@ -37,17 +46,95 @@ static struct syscore_ops umwait_syscore_ops = {
.resume = umwait_syscore_resume,
};
+static void umwait_control_msr_update(void *unused)
+{
+ wrmsr(MSR_IA32_UMWAIT_CONTROL, umwait_control_cached, 0);
+}
+
+static u32 get_umwait_control_c02(void)
+{
+ return umwait_control_cached & MSR_IA32_UMWAIT_CONTROL_C02;
+}
+
+static u32 get_umwait_control_max_time(void)
+{
+ return umwait_control_cached & MSR_IA32_UMWAIT_CONTROL_MAX_TIME;
+}
+
+static ssize_t
+enable_c02_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ /*
+ * When bit 0 in IA32_UMWAIT_CONTROL MSR is 1, C0.2 is disabled.
+ * Otherwise, C0.2 is enabled. Show the opposite of bit 0.
+ */
+ return sprintf(buf, "%d\n", !(bool)get_umwait_control_c02());
+}
+
+static ssize_t enable_c02_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ u32 umwait_control_c02;
+ bool c02_enabled;
+ int ret;
+
+ ret = kstrtobool(buf, &c02_enabled);
+ if (ret)
+ return ret;
+
+ mutex_lock(&umwait_lock);
+
+ /*
+ * The value of bit 0 in IA32_UMWAIT_CONTROL MSR is opposite of
+ * c02_enabled.
+ */
+ umwait_control_c02 = (u32)!c02_enabled;
+ if (umwait_control_c02 == get_umwait_control_c02())
+ goto out_unlock;
+
+ umwait_control_cached = umwait_control_c02 | get_umwait_control_max_time();
+ /* Enable/disable C0.2 state on all CPUs */
+ on_each_cpu(umwait_control_msr_update, NULL, 1);
+
+out_unlock:
+ mutex_unlock(&umwait_lock);
+
+ return count;
+}
+static DEVICE_ATTR_RW(enable_c02);
+
+static struct attribute *umwait_attrs[] = {
+ &dev_attr_enable_c02.attr,
+ NULL
+};
+
+static struct attribute_group umwait_attr_group = {
+ .attrs = umwait_attrs,
+ .name = "umwait_control",
+};
+
static int __init umwait_init(void)
{
+ struct device *dev;
int ret;
if (!boot_cpu_has(X86_FEATURE_WAITPKG))
return -ENODEV;
+ /* Add umwait control interface. */
+ dev = cpu_subsys.dev_root;
+ ret = sysfs_create_group(&dev->kobj, &umwait_attr_group);
+ if (ret)
+ return ret;
+
ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait/intel:online",
umwait_cpu_online, NULL);
- if (ret < 0)
+ if (ret < 0) {
+ sysfs_remove_group(&dev->kobj, &umwait_attr_group);
+
return ret;
+ }
register_syscore_ops(&umwait_syscore_ops);
--
2.19.1
On Fri, Jun 7, 2019 at 3:10 PM Fenghua Yu <[email protected]> wrote:
>
> C0.2 state in umwait and tpause instructions can be enabled or disabled
> on a processor through IA32_UMWAIT_CONTROL MSR register.
>
> +static u32 get_umwait_control_c02(void)
> +{
> + return umwait_control_cached & MSR_IA32_UMWAIT_CONTROL_C02;
> +}
> +
> +static u32 get_umwait_control_max_time(void)
> +{
> + return umwait_control_cached & MSR_IA32_UMWAIT_CONTROL_MAX_TIME;
> +}
> +
I'm not convinced that these helpers make the code any more readable.
On Fri, Jun 7, 2019 at 3:10 PM Fenghua Yu <[email protected]> wrote:
>
> C0.2 state in umwait and tpause instructions can be enabled or disabled
> on a processor through IA32_UMWAIT_CONTROL MSR register.
>
> By default, C0.2 is enabled and the user wait instructions result in
> lower power consumption with slower wakeup time.
>
> But in real time systems which require faster wakeup time although power
> savings could be smaller, the administrator needs to disable C0.2 and all
> C0.2 requests from user applications revert to C0.1.
>
> A sysfs interface "/sys/devices/system/cpu/umwait_control/enable_c02" is
> created to allow the administrator to control C0.2 state during run time.
This looks better than the previous version. I think the locking is
still rather confused. You have a mutex that you hold while changing
the value, which is entirely reasonable. But, of the code paths that
write the MSR, only one takes the mutex.
I think you should consider making a function that just does:
wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
and using it in all the places that update the MSR. The only thing
that should need the lock is the sysfs code to avoid accidentally
corrupting the value, but that code should also use WRITE_ONCE to do
its update.
On Sat, Jun 08, 2019 at 03:50:32PM -0700, Andy Lutomirski wrote:
> On Fri, Jun 7, 2019 at 3:10 PM Fenghua Yu <[email protected]> wrote:
> >
> > C0.2 state in umwait and tpause instructions can be enabled or disabled
> > on a processor through IA32_UMWAIT_CONTROL MSR register.
> >
> > By default, C0.2 is enabled and the user wait instructions result in
> > lower power consumption with slower wakeup time.
> >
> > But in real time systems which require faster wakeup time although power
> > savings could be smaller, the administrator needs to disable C0.2 and all
> > C0.2 requests from user applications revert to C0.1.
> >
> > A sysfs interface "/sys/devices/system/cpu/umwait_control/enable_c02" is
> > created to allow the administrator to control C0.2 state during run time.
>
> This looks better than the previous version. I think the locking is
> still rather confused. You have a mutex that you hold while changing
> the value, which is entirely reasonable. But, of the code paths that
> write the MSR, only one takes the mutex.
>
> I think you should consider making a function that just does:
>
> wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
>
> and using it in all the places that update the MSR. The only thing
> that should need the lock is the sysfs code to avoid accidentally
> corrupting the value, but that code should also use WRITE_ONCE to do
> its update.
Based on the comment, the illustrative CPU online and enable_c02 store
functions would be:
umwait_cpu_online()
{
wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
return 0;
}
enable_c02_store()
{
mutex_lock(&umwait_lock);
umwait_control_c02 = (u32)!c02_enabled;
WRITE_ONCE(umwait_control_cached, 2 | get_umwait_control_max_time());
on_each_cpu(umwait_control_msr_update, NULL, 1);
mutex_unlock(&umwait_lock);
}
Then suppose umwait_control_cached = 100000 initially and only CPU0 is
running. Admin change bit 0 in MSR from 0 to 1 to disable C0.2 and is
onlining CPU1 in the same time:
1. On CPU1, read umwait_control_cached to eax as 100000 in
umwait_cpu_online()
2. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
3. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
4. On CPU0, wrmsr with 100001 in enabled_c02_store()
The result is CPU0 and CPU1 have different MSR values.
The problem is because there is no wrmsr serialization b/w uwait_cpu_online()
and enable_c02_store(). The WRITE_ONCE() and READ_ONCE() only serialize
access to umwait_control_cached. But we need to serialize wrmsr() as well to
guarantee all CPUs have the same MSR value.
So does it make sense to keep the mutex and locking as the current patch does?
Thanks.
-Fenghua
On Sat, Jun 08, 2019 at 03:52:03PM -0700, Andy Lutomirski wrote:
> On Fri, Jun 7, 2019 at 3:10 PM Fenghua Yu <[email protected]> wrote:
> >
> > C0.2 state in umwait and tpause instructions can be enabled or disabled
> > on a processor through IA32_UMWAIT_CONTROL MSR register.
> >
>
> > +static u32 get_umwait_control_c02(void)
> > +{
> > + return umwait_control_cached & MSR_IA32_UMWAIT_CONTROL_C02;
> > +}
> > +
> > +static u32 get_umwait_control_max_time(void)
> > +{
> > + return umwait_control_cached & MSR_IA32_UMWAIT_CONTROL_MAX_TIME;
> > +}
> > +
>
> I'm not convinced that these helpers make the code any more readable.
The helpers reduce length of statements that call them. Otherwise, all of
the statements would be easily over 80 characters.
Plus, each of the helpers is called multiple places in #0003 and #0004.
So the helpers make the patches smaller and cleaner.
So is it still OK to keep the helpers?
Thanks.
-Fenghua
On Sun, Jun 9, 2019 at 9:02 PM Fenghua Yu <[email protected]> wrote:
>
> On Sat, Jun 08, 2019 at 03:50:32PM -0700, Andy Lutomirski wrote:
> > On Fri, Jun 7, 2019 at 3:10 PM Fenghua Yu <[email protected]> wrote:
> > >
> > > C0.2 state in umwait and tpause instructions can be enabled or disabled
> > > on a processor through IA32_UMWAIT_CONTROL MSR register.
> > >
> > > By default, C0.2 is enabled and the user wait instructions result in
> > > lower power consumption with slower wakeup time.
> > >
> > > But in real time systems which require faster wakeup time although power
> > > savings could be smaller, the administrator needs to disable C0.2 and all
> > > C0.2 requests from user applications revert to C0.1.
> > >
> > > A sysfs interface "/sys/devices/system/cpu/umwait_control/enable_c02" is
> > > created to allow the administrator to control C0.2 state during run time.
> >
> > This looks better than the previous version. I think the locking is
> > still rather confused. You have a mutex that you hold while changing
> > the value, which is entirely reasonable. But, of the code paths that
> > write the MSR, only one takes the mutex.
> >
> > I think you should consider making a function that just does:
> >
> > wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
> >
> > and using it in all the places that update the MSR. The only thing
> > that should need the lock is the sysfs code to avoid accidentally
> > corrupting the value, but that code should also use WRITE_ONCE to do
> > its update.
>
> Based on the comment, the illustrative CPU online and enable_c02 store
> functions would be:
>
> umwait_cpu_online()
> {
> wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
> return 0;
> }
>
> enable_c02_store()
> {
> mutex_lock(&umwait_lock);
> umwait_control_c02 = (u32)!c02_enabled;
> WRITE_ONCE(umwait_control_cached, 2 | get_umwait_control_max_time());
> on_each_cpu(umwait_control_msr_update, NULL, 1);
> mutex_unlock(&umwait_lock);
> }
>
> Then suppose umwait_control_cached = 100000 initially and only CPU0 is
> running. Admin change bit 0 in MSR from 0 to 1 to disable C0.2 and is
> onlining CPU1 in the same time:
>
> 1. On CPU1, read umwait_control_cached to eax as 100000 in
> umwait_cpu_online()
> 2. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
> 3. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
> 4. On CPU0, wrmsr with 100001 in enabled_c02_store()
>
> The result is CPU0 and CPU1 have different MSR values.
Yes, but only transiently, because you didn't finish your example.
Step 5: enable_c02_store() does on_each_cpu(), and CPU 1 gets updated.
On Sun, Jun 9, 2019 at 9:14 PM Fenghua Yu <[email protected]> wrote:
>
> On Sat, Jun 08, 2019 at 03:52:03PM -0700, Andy Lutomirski wrote:
> > On Fri, Jun 7, 2019 at 3:10 PM Fenghua Yu <[email protected]> wrote:
> > >
> > > C0.2 state in umwait and tpause instructions can be enabled or disabled
> > > on a processor through IA32_UMWAIT_CONTROL MSR register.
> > >
> >
> > > +static u32 get_umwait_control_c02(void)
> > > +{
> > > + return umwait_control_cached & MSR_IA32_UMWAIT_CONTROL_C02;
> > > +}
> > > +
> > > +static u32 get_umwait_control_max_time(void)
> > > +{
> > > + return umwait_control_cached & MSR_IA32_UMWAIT_CONTROL_MAX_TIME;
> > > +}
> > > +
> >
> > I'm not convinced that these helpers make the code any more readable.
>
> The helpers reduce length of statements that call them. Otherwise, all of
> the statements would be easily over 80 characters.
>
> Plus, each of the helpers is called multiple places in #0003 and #0004.
> So the helpers make the patches smaller and cleaner.
>
I was imagining things like:
umwait_control_cached &= ~MSR_IA32_UMWAIT_CONTROL_C02;
if (whatever condition)
umwait_control_cached |= MSR_IA32_UMWAIT_CONTROL_C02;
umwait_control_cached &= ~MSR_IA32_UMWAIT_CONTROL_MAX_TIME;
umwait_control_cached |= new_max_time;
You could save 8 characters by just calling the variable umwait_control.
On Sun, Jun 09, 2019 at 09:24:18PM -0700, Andy Lutomirski wrote:
> On Sun, Jun 9, 2019 at 9:02 PM Fenghua Yu <[email protected]> wrote:
> >
> > On Sat, Jun 08, 2019 at 03:50:32PM -0700, Andy Lutomirski wrote:
> > > On Fri, Jun 7, 2019 at 3:10 PM Fenghua Yu <[email protected]> wrote:
> > > >
> > > > C0.2 state in umwait and tpause instructions can be enabled or disabled
> > > > on a processor through IA32_UMWAIT_CONTROL MSR register.
> > > >
> > > > By default, C0.2 is enabled and the user wait instructions result in
> > > > lower power consumption with slower wakeup time.
> > > >
> > > > But in real time systems which require faster wakeup time although power
> > > > savings could be smaller, the administrator needs to disable C0.2 and all
> > > > C0.2 requests from user applications revert to C0.1.
> > > >
> > > > A sysfs interface "/sys/devices/system/cpu/umwait_control/enable_c02" is
> > > > created to allow the administrator to control C0.2 state during run time.
> > >
> > > This looks better than the previous version. I think the locking is
> > > still rather confused. You have a mutex that you hold while changing
> > > the value, which is entirely reasonable. But, of the code paths that
> > > write the MSR, only one takes the mutex.
> > >
> > > I think you should consider making a function that just does:
> > >
> > > wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
> > >
> > > and using it in all the places that update the MSR. The only thing
> > > that should need the lock is the sysfs code to avoid accidentally
> > > corrupting the value, but that code should also use WRITE_ONCE to do
> > > its update.
> >
> > Based on the comment, the illustrative CPU online and enable_c02 store
> > functions would be:
> >
> > umwait_cpu_online()
> > {
> > wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
> > return 0;
> > }
> >
> > enable_c02_store()
> > {
> > mutex_lock(&umwait_lock);
> > umwait_control_c02 = (u32)!c02_enabled;
> > WRITE_ONCE(umwait_control_cached, 2 | get_umwait_control_max_time());
> > on_each_cpu(umwait_control_msr_update, NULL, 1);
> > mutex_unlock(&umwait_lock);
> > }
> >
> > Then suppose umwait_control_cached = 100000 initially and only CPU0 is
> > running. Admin change bit 0 in MSR from 0 to 1 to disable C0.2 and is
> > onlining CPU1 in the same time:
> >
> > 1. On CPU1, read umwait_control_cached to eax as 100000 in
> > umwait_cpu_online()
> > 2. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
> > 3. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
> > 4. On CPU0, wrmsr with 100001 in enabled_c02_store()
> >
> > The result is CPU0 and CPU1 have different MSR values.
>
> Yes, but only transiently, because you didn't finish your example.
>
> Step 5: enable_c02_store() does on_each_cpu(), and CPU 1 gets updated.
There is no sync on wrmsr on CPU0 and CPU1. So a better sequence to
describe the problem is changing the order of wrmsr:
1. On CPU1, read umwait_control_cached to eax as 100000 in
umwait_cpu_online()
2. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
3. On CPU0, wrmsr with 100001 in on_each_cpu() in enabled_c02_store()
4. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
So CPU1 and CPU0 have different MSR values. This won't be transient.
So we do need the mutex as in the current patch, right?
Thanks.
-Fenghua
> On Jun 9, 2019, at 11:02 PM, Fenghua Yu <[email protected]> wrote:
>
>> On Sun, Jun 09, 2019 at 09:24:18PM -0700, Andy Lutomirski wrote:
>>> On Sun, Jun 9, 2019 at 9:02 PM Fenghua Yu <[email protected]> wrote:
>>>
>>>> On Sat, Jun 08, 2019 at 03:50:32PM -0700, Andy Lutomirski wrote:
>>>>> On Fri, Jun 7, 2019 at 3:10 PM Fenghua Yu <[email protected]> wrote:
>>>>>
>>>>> C0.2 state in umwait and tpause instructions can be enabled or disabled
>>>>> on a processor through IA32_UMWAIT_CONTROL MSR register.
>>>>>
>>>>> By default, C0.2 is enabled and the user wait instructions result in
>>>>> lower power consumption with slower wakeup time.
>>>>>
>>>>> But in real time systems which require faster wakeup time although power
>>>>> savings could be smaller, the administrator needs to disable C0.2 and all
>>>>> C0.2 requests from user applications revert to C0.1.
>>>>>
>>>>> A sysfs interface "/sys/devices/system/cpu/umwait_control/enable_c02" is
>>>>> created to allow the administrator to control C0.2 state during run time.
>>>>
>>>> This looks better than the previous version. I think the locking is
>>>> still rather confused. You have a mutex that you hold while changing
>>>> the value, which is entirely reasonable. But, of the code paths that
>>>> write the MSR, only one takes the mutex.
>>>>
>>>> I think you should consider making a function that just does:
>>>>
>>>> wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
>>>>
>>>> and using it in all the places that update the MSR. The only thing
>>>> that should need the lock is the sysfs code to avoid accidentally
>>>> corrupting the value, but that code should also use WRITE_ONCE to do
>>>> its update.
>>>
>>> Based on the comment, the illustrative CPU online and enable_c02 store
>>> functions would be:
>>>
>>> umwait_cpu_online()
>>> {
>>> wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
>>> return 0;
>>> }
>>>
>>> enable_c02_store()
>>> {
>>> mutex_lock(&umwait_lock);
>>> umwait_control_c02 = (u32)!c02_enabled;
>>> WRITE_ONCE(umwait_control_cached, 2 | get_umwait_control_max_time());
>>> on_each_cpu(umwait_control_msr_update, NULL, 1);
>>> mutex_unlock(&umwait_lock);
>>> }
>>>
>>> Then suppose umwait_control_cached = 100000 initially and only CPU0 is
>>> running. Admin change bit 0 in MSR from 0 to 1 to disable C0.2 and is
>>> onlining CPU1 in the same time:
>>>
>>> 1. On CPU1, read umwait_control_cached to eax as 100000 in
>>> umwait_cpu_online()
>>> 2. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
>>> 3. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
>>> 4. On CPU0, wrmsr with 100001 in enabled_c02_store()
>>>
>>> The result is CPU0 and CPU1 have different MSR values.
>>
>> Yes, but only transiently, because you didn't finish your example.
>>
>> Step 5: enable_c02_store() does on_each_cpu(), and CPU 1 gets updated.
>
> There is no sync on wrmsr on CPU0 and CPU1.
What do you mean by sync?
> So a better sequence to
> describe the problem is changing the order of wrmsr:
>
> 1. On CPU1, read umwait_control_cached to eax as 100000 in
> umwait_cpu_online()
> 2. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
> 3. On CPU0, wrmsr with 100001 in on_each_cpu() in enabled_c02_store()
> 4. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
>
> So CPU1 and CPU0 have different MSR values. This won't be transient.
You are still ignoring the wrmsr on CPU1 due to on_each_cpu().
On Fri, Jun 07, 2019 at 03:00:35PM -0700, Fenghua Yu wrote:
> C0.2 state in umwait and tpause instructions can be enabled or disabled
> on a processor through IA32_UMWAIT_CONTROL MSR register.
>
> By default, C0.2 is enabled and the user wait instructions result in
> lower power consumption with slower wakeup time.
>
> But in real time systems which require faster wakeup time although power
> savings could be smaller, the administrator needs to disable C0.2 and all
> C0.2 requests from user applications revert to C0.1.
>
> A sysfs interface "/sys/devices/system/cpu/umwait_control/enable_c02" is
> created to allow the administrator to control C0.2 state during run time.
We already have an interface for applications to convey their latency
requirements (pm-qos). We do not need another magic sys variable.
> On Jun 11, 2019, at 1:54 AM, Peter Zijlstra <[email protected]> wrote:
>
>> On Fri, Jun 07, 2019 at 03:00:35PM -0700, Fenghua Yu wrote:
>> C0.2 state in umwait and tpause instructions can be enabled or disabled
>> on a processor through IA32_UMWAIT_CONTROL MSR register.
>>
>> By default, C0.2 is enabled and the user wait instructions result in
>> lower power consumption with slower wakeup time.
>>
>> But in real time systems which require faster wakeup time although power
>> savings could be smaller, the administrator needs to disable C0.2 and all
>> C0.2 requests from user applications revert to C0.1.
>>
>> A sysfs interface "/sys/devices/system/cpu/umwait_control/enable_c02" is
>> created to allow the administrator to control C0.2 state during run time.
>
> We already have an interface for applications to convey their latency
> requirements (pm-qos). We do not need another magic sys variable.
I’m not sure I agree. This isn’t an overall latency request, and setting an absurdly low pm_qos will badly hurt idle power and turbo performance. Also, pm_qos isn’t exactly beautiful.
(I speak from some experience. I may be literally the only person to write a driver that listens to dev_pm_qos latency requests. And, in my production box, I directly disable c states instead of messing with pm_qos.)
I do wonder whether anyone will ever use this particular control, though.
(can you, perchance, look at a MUA that isn't 'broken' ?)
On Tue, Jun 11, 2019 at 09:04:30AM -0700, Andy Lutomirski wrote:
>
>
> > On Jun 11, 2019, at 1:54 AM, Peter Zijlstra <[email protected]> wrote:
> >
> >> On Fri, Jun 07, 2019 at 03:00:35PM -0700, Fenghua Yu wrote:
> >> C0.2 state in umwait and tpause instructions can be enabled or disabled
> >> on a processor through IA32_UMWAIT_CONTROL MSR register.
> >>
> >> By default, C0.2 is enabled and the user wait instructions result in
> >> lower power consumption with slower wakeup time.
> >>
> >> But in real time systems which require faster wakeup time although power
> >> savings could be smaller, the administrator needs to disable C0.2 and all
> >> C0.2 requests from user applications revert to C0.1.
> >>
> >> A sysfs interface "/sys/devices/system/cpu/umwait_control/enable_c02" is
> >> created to allow the administrator to control C0.2 state during run time.
> >
> > We already have an interface for applications to convey their latency
> > requirements (pm-qos). We do not need another magic sys variable.
>
> I’m not sure I agree. This isn’t an overall latency request, and
> setting an absurdly low pm_qos will badly hurt idle power and turbo
> performance. Also, pm_qos isn’t exactly beautiful.
>
> (I speak from some experience. I may be literally the only person to
> write a driver that listens to dev_pm_qos latency requests. And, in my
> production box, I directly disable c states instead of messing with
> pm_qos.)
>
> I do wonder whether anyone will ever use this particular control, though.
I agree that pm-qos is pretty terrible; but that doesn't mean we should
just add random control files all over the place.
On Tue, Jun 11, 2019 at 10:27 AM Peter Zijlstra <[email protected]> wrote:
>
>
> (can you, perchance, look at a MUA that isn't 'broken' ?)
>
> On Tue, Jun 11, 2019 at 09:04:30AM -0700, Andy Lutomirski wrote:
> >
> >
> > > On Jun 11, 2019, at 1:54 AM, Peter Zijlstra <[email protected]> wrote:
> > >
> > >> On Fri, Jun 07, 2019 at 03:00:35PM -0700, Fenghua Yu wrote:
> > >> C0.2 state in umwait and tpause instructions can be enabled or disabled
> > >> on a processor through IA32_UMWAIT_CONTROL MSR register.
> > >>
> > >> By default, C0.2 is enabled and the user wait instructions result in
> > >> lower power consumption with slower wakeup time.
> > >>
> > >> But in real time systems which require faster wakeup time although power
> > >> savings could be smaller, the administrator needs to disable C0.2 and all
> > >> C0.2 requests from user applications revert to C0.1.
> > >>
> > >> A sysfs interface "/sys/devices/system/cpu/umwait_control/enable_c02" is
> > >> created to allow the administrator to control C0.2 state during run time.
> > >
> > > We already have an interface for applications to convey their latency
> > > requirements (pm-qos). We do not need another magic sys variable.
> >
> > I’m not sure I agree. This isn’t an overall latency request, and
> > setting an absurdly low pm_qos will badly hurt idle power and turbo
> > performance. Also, pm_qos isn’t exactly beautiful.
> >
> > (I speak from some experience. I may be literally the only person to
> > write a driver that listens to dev_pm_qos latency requests. And, in my
> > production box, I directly disable c states instead of messing with
> > pm_qos.)
> >
> > I do wonder whether anyone will ever use this particular control, though.
>
> I agree that pm-qos is pretty terrible; but that doesn't mean we should
> just add random control files all over the place.
I don't think pm-qos is expressive enough. It seems entirely
reasonable to want to do a C0.1 wait for lower latency *while waiting*
but still want full power-saving idle when not waiting.
Do we even know what the C0.2 and C0.1 latencies are? And why is this
thing an MSR instead of a flag passed to UMWAIT?
--Andy
On Mon, Jun 17, 2019 at 08:14:44AM -0700, Andy Lutomirski wrote:
> On Tue, Jun 11, 2019 at 10:27 AM Peter Zijlstra <[email protected]> wrote:
> >
> >
> > (can you, perchance, look at a MUA that isn't 'broken' ?)
> >
> > On Tue, Jun 11, 2019 at 09:04:30AM -0700, Andy Lutomirski wrote:
> > >
> > >
> > > > On Jun 11, 2019, at 1:54 AM, Peter Zijlstra <[email protected]> wrote:
> > > >
> > > >> On Fri, Jun 07, 2019 at 03:00:35PM -0700, Fenghua Yu wrote:
> > > >> C0.2 state in umwait and tpause instructions can be enabled or disabled
> > > >> on a processor through IA32_UMWAIT_CONTROL MSR register.
> > > >>
> > > >> By default, C0.2 is enabled and the user wait instructions result in
> > > >> lower power consumption with slower wakeup time.
> > > >>
> > > >> But in real time systems which require faster wakeup time although power
> > > >> savings could be smaller, the administrator needs to disable C0.2 and all
> > > >> C0.2 requests from user applications revert to C0.1.
> > > >>
> > > >> A sysfs interface "/sys/devices/system/cpu/umwait_control/enable_c02" is
> > > >> created to allow the administrator to control C0.2 state during run time.
> > > >
> > > > We already have an interface for applications to convey their latency
> > > > requirements (pm-qos). We do not need another magic sys variable.
> > >
> > > I’m not sure I agree. This isn’t an overall latency request, and
> > > setting an absurdly low pm_qos will badly hurt idle power and turbo
> > > performance. Also, pm_qos isn’t exactly beautiful.
> > >
> > > (I speak from some experience. I may be literally the only person to
> > > write a driver that listens to dev_pm_qos latency requests. And, in my
> > > production box, I directly disable c states instead of messing with
> > > pm_qos.)
> > >
> > > I do wonder whether anyone will ever use this particular control, though.
> >
> > I agree that pm-qos is pretty terrible; but that doesn't mean we should
> > just add random control files all over the place.
>
> I don't think pm-qos is expressive enough. It seems entirely
> reasonable to want to do a C0.1 wait for lower latency *while waiting*
> but still want full power-saving idle when not waiting.
>
> Do we even know what the C0.2 and C0.1 latencies are? And why is this
> thing an MSR instead of a flag passed to UMWAIT?
I will still keep this sysfs interface in the next version of patches, right?
Thanks.
-Fenghua
On Mon, Jun 10, 2019 at 06:41:31AM -0700, Andy Lutomirski wrote:
>
>
> > On Jun 9, 2019, at 11:02 PM, Fenghua Yu <[email protected]> wrote:
> >
> >> On Sun, Jun 09, 2019 at 09:24:18PM -0700, Andy Lutomirski wrote:
> >>> On Sun, Jun 9, 2019 at 9:02 PM Fenghua Yu <[email protected]> wrote:
> >>>
> >>>> On Sat, Jun 08, 2019 at 03:50:32PM -0700, Andy Lutomirski wrote:
> >>>>> On Fri, Jun 7, 2019 at 3:10 PM Fenghua Yu <[email protected]> wrote:
> >>>>>
> >>>>> C0.2 state in umwait and tpause instructions can be enabled or disabled
> >>>>> on a processor through IA32_UMWAIT_CONTROL MSR register.
> >>>>>
> >>>>> By default, C0.2 is enabled and the user wait instructions result in
> >>>>> lower power consumption with slower wakeup time.
> >>>>>
> >>>>> But in real time systems which require faster wakeup time although power
> >>>>> savings could be smaller, the administrator needs to disable C0.2 and all
> >>>>> C0.2 requests from user applications revert to C0.1.
> >>>>>
> >>>>> A sysfs interface "/sys/devices/system/cpu/umwait_control/enable_c02" is
> >>>>> created to allow the administrator to control C0.2 state during run time.
> >>>>
> >>>> This looks better than the previous version. I think the locking is
> >>>> still rather confused. You have a mutex that you hold while changing
> >>>> the value, which is entirely reasonable. But, of the code paths that
> >>>> write the MSR, only one takes the mutex.
> >>>>
> >>>> I think you should consider making a function that just does:
> >>>>
> >>>> wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
> >>>>
> >>>> and using it in all the places that update the MSR. The only thing
> >>>> that should need the lock is the sysfs code to avoid accidentally
> >>>> corrupting the value, but that code should also use WRITE_ONCE to do
> >>>> its update.
> >>>
> >>> Based on the comment, the illustrative CPU online and enable_c02 store
> >>> functions would be:
> >>>
> >>> umwait_cpu_online()
> >>> {
> >>> wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
> >>> return 0;
> >>> }
> >>>
> >>> enable_c02_store()
> >>> {
> >>> mutex_lock(&umwait_lock);
> >>> umwait_control_c02 = (u32)!c02_enabled;
> >>> WRITE_ONCE(umwait_control_cached, 2 | get_umwait_control_max_time());
> >>> on_each_cpu(umwait_control_msr_update, NULL, 1);
> >>> mutex_unlock(&umwait_lock);
> >>> }
> >>>
> >>> Then suppose umwait_control_cached = 100000 initially and only CPU0 is
> >>> running. Admin change bit 0 in MSR from 0 to 1 to disable C0.2 and is
> >>> onlining CPU1 in the same time:
> >>>
> >>> 1. On CPU1, read umwait_control_cached to eax as 100000 in
> >>> umwait_cpu_online()
> >>> 2. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
> >>> 3. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
> >>> 4. On CPU0, wrmsr with 100001 in enabled_c02_store()
> >>>
> >>> The result is CPU0 and CPU1 have different MSR values.
> >>
> >> Yes, but only transiently, because you didn't finish your example.
> >>
> >> Step 5: enable_c02_store() does on_each_cpu(), and CPU 1 gets updated.
> >
> > There is no sync on wrmsr on CPU0 and CPU1.
>
> What do you mean by sync?
>
> > So a better sequence to
> > describe the problem is changing the order of wrmsr:
> >
> > 1. On CPU1, read umwait_control_cached to eax as 100000 in
> > umwait_cpu_online()
> > 2. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
> > 3. On CPU0, wrmsr with 100001 in on_each_cpu() in enabled_c02_store()
> > 4. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
> >
> > So CPU1 and CPU0 have different MSR values. This won't be transient.
>
> You are still ignoring the wrmsr on CPU1 due to on_each_cpu().
>
Initially umwait_control_cached is 100000 and CPU0 is online while CPU1
is going to be online:
1. On CPU1, cpu_online_mask=0x3 in start_secondary()
2. On CPU1, read umwait_control_cached to eax as 100000 in umwait_cpu_online()
3. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
4. On CPU0, execute one_each_cpu() in enabled_c02_store():
wrmsr with 100001 on CPU0
wrmsr with 100001 on CPU1
5. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
So the MSR is 100000 on CPU1 and 100001 on CPU0. The MSRs are different on
the CPUs.
Is this a right sequence to demonstrate locking issue without the mutex
locking?
Thanks.
-Fenghua
On Sun, Jun 09, 2019 at 09:26:29PM -0700, Andy Lutomirski wrote:
> On Sun, Jun 9, 2019 at 9:14 PM Fenghua Yu <[email protected]> wrote:
> >
> > On Sat, Jun 08, 2019 at 03:52:03PM -0700, Andy Lutomirski wrote:
> > > On Fri, Jun 7, 2019 at 3:10 PM Fenghua Yu <[email protected]> wrote:
> > > >
> > > > C0.2 state in umwait and tpause instructions can be enabled or disabled
> > > > on a processor through IA32_UMWAIT_CONTROL MSR register.
> > > >
> > >
> > > > +static u32 get_umwait_control_c02(void)
> > > > +{
> > > > + return umwait_control_cached & MSR_IA32_UMWAIT_CONTROL_C02;
> > > > +}
> > > > +
> > > > +static u32 get_umwait_control_max_time(void)
> > > > +{
> > > > + return umwait_control_cached & MSR_IA32_UMWAIT_CONTROL_MAX_TIME;
> > > > +}
> > > > +
> > >
> > > I'm not convinced that these helpers make the code any more readable.
> >
> > The helpers reduce length of statements that call them. Otherwise, all of
> > the statements would be easily over 80 characters.
> >
> > Plus, each of the helpers is called multiple places in #0003 and #0004.
> > So the helpers make the patches smaller and cleaner.
> >
>
> I was imagining things like:
>
> umwait_control_cached &= ~MSR_IA32_UMWAIT_CONTROL_C02;
> if (whatever condition)
> umwait_control_cached |= MSR_IA32_UMWAIT_CONTROL_C02;
> umwait_control_cached &= ~MSR_IA32_UMWAIT_CONTROL_MAX_TIME;
> umwait_control_cached |= new_max_time;
How about this statement?
With the helpers:
umwait_control_cached = max_time | get_umwait_control_c02();
If there is no helpers, the above statement will need two statements:
umwait_control_cached &= ~MSR_IA32_UMWAIT_CONTROL_MAX_TIME;
umwait_control_cached |= max_time;
Another example:
With the helpers:
if (umwait_control_c02 == get_umwait_control_c02())
If no helpers, the above statement will be long:
if (umwait_control_c02 == (umwait_control_cached & MSR_IA32_UMWAIT_CONTROL_C02_DISABLED))
There are quite a few places like above examples.
The helpers can reduce the length of those long lines and make code more
readable and shorter, right?
Can I still keep the helpers?
Thanks.
-Fenghua
On Mon, Jun 17, 2019 at 3:57 PM Fenghua Yu <[email protected]> wrote:
>
> On Sun, Jun 09, 2019 at 09:26:29PM -0700, Andy Lutomirski wrote:
> > On Sun, Jun 9, 2019 at 9:14 PM Fenghua Yu <[email protected]> wrote:
> > >
> > > On Sat, Jun 08, 2019 at 03:52:03PM -0700, Andy Lutomirski wrote:
> > > > On Fri, Jun 7, 2019 at 3:10 PM Fenghua Yu <[email protected]> wrote:
> > > > >
> > > > > C0.2 state in umwait and tpause instructions can be enabled or disabled
> > > > > on a processor through IA32_UMWAIT_CONTROL MSR register.
> > > > >
> > > >
> > > > > +static u32 get_umwait_control_c02(void)
> > > > > +{
> > > > > + return umwait_control_cached & MSR_IA32_UMWAIT_CONTROL_C02;
> > > > > +}
> > > > > +
> > > > > +static u32 get_umwait_control_max_time(void)
> > > > > +{
> > > > > + return umwait_control_cached & MSR_IA32_UMWAIT_CONTROL_MAX_TIME;
> > > > > +}
> > > > > +
> > > >
> > > > I'm not convinced that these helpers make the code any more readable.
> > >
> > > The helpers reduce length of statements that call them. Otherwise, all of
> > > the statements would be easily over 80 characters.
> > >
> > > Plus, each of the helpers is called multiple places in #0003 and #0004.
> > > So the helpers make the patches smaller and cleaner.
> > >
> >
> > I was imagining things like:
> >
> > umwait_control_cached &= ~MSR_IA32_UMWAIT_CONTROL_C02;
> > if (whatever condition)
> > umwait_control_cached |= MSR_IA32_UMWAIT_CONTROL_C02;
> > umwait_control_cached &= ~MSR_IA32_UMWAIT_CONTROL_MAX_TIME;
> > umwait_control_cached |= new_max_time;
>
> How about this statement?
> With the helpers:
> umwait_control_cached = max_time | get_umwait_control_c02();
> If there is no helpers, the above statement will need two statements:
> umwait_control_cached &= ~MSR_IA32_UMWAIT_CONTROL_MAX_TIME;
> umwait_control_cached |= max_time;
>
> Another example:
> With the helpers:
> if (umwait_control_c02 == get_umwait_control_c02())
> If no helpers, the above statement will be long:
> if (umwait_control_c02 == (umwait_control_cached & MSR_IA32_UMWAIT_CONTROL_C02_DISABLED))
>
> There are quite a few places like above examples.
>
> The helpers can reduce the length of those long lines and make code more
> readable and shorter, right?
>
> Can I still keep the helpers?
>
Sure, unless someone else objects.
On Mon, Jun 17, 2019 at 03:59:28PM -0700, Andy Lutomirski wrote:
> On Mon, Jun 17, 2019 at 3:57 PM Fenghua Yu <[email protected]> wrote:
> >
> > On Sun, Jun 09, 2019 at 09:26:29PM -0700, Andy Lutomirski wrote:
> > > On Sun, Jun 9, 2019 at 9:14 PM Fenghua Yu <[email protected]> wrote:
> > > >
> > > > On Sat, Jun 08, 2019 at 03:52:03PM -0700, Andy Lutomirski wrote:
> > > > > On Fri, Jun 7, 2019 at 3:10 PM Fenghua Yu <[email protected]> wrote:
> > > > > >
> > > > > > C0.2 state in umwait and tpause instructions can be enabled or disabled
> > > > > > on a processor through IA32_UMWAIT_CONTROL MSR register.
> > > > > >
> > > > >
> > > > > > +static u32 get_umwait_control_c02(void)
> > > > > > +{
> > > > > > + return umwait_control_cached & MSR_IA32_UMWAIT_CONTROL_C02;
> > > > > > +}
> > > > > > +
> > > > > > +static u32 get_umwait_control_max_time(void)
> > > > > > +{
> > > > > > + return umwait_control_cached & MSR_IA32_UMWAIT_CONTROL_MAX_TIME;
> > > > > > +}
> > > > > > +
> > > > >
> > > > > I'm not convinced that these helpers make the code any more readable.
> > > >
> > > > The helpers reduce length of statements that call them. Otherwise, all of
> > > > the statements would be easily over 80 characters.
> > > >
> > > > Plus, each of the helpers is called multiple places in #0003 and #0004.
> > > > So the helpers make the patches smaller and cleaner.
> > > >
> > >
> > > I was imagining things like:
> > >
> > > umwait_control_cached &= ~MSR_IA32_UMWAIT_CONTROL_C02;
> > > if (whatever condition)
> > > umwait_control_cached |= MSR_IA32_UMWAIT_CONTROL_C02;
> > > umwait_control_cached &= ~MSR_IA32_UMWAIT_CONTROL_MAX_TIME;
> > > umwait_control_cached |= new_max_time;
> >
> > How about this statement?
> > With the helpers:
> > umwait_control_cached = max_time | get_umwait_control_c02();
> > If there is no helpers, the above statement will need two statements:
> > umwait_control_cached &= ~MSR_IA32_UMWAIT_CONTROL_MAX_TIME;
> > umwait_control_cached |= max_time;
> >
> > Another example:
> > With the helpers:
> > if (umwait_control_c02 == get_umwait_control_c02())
> > If no helpers, the above statement will be long:
> > if (umwait_control_c02 == (umwait_control_cached & MSR_IA32_UMWAIT_CONTROL_C02_DISABLED))
> >
> > There are quite a few places like above examples.
> >
> > The helpers can reduce the length of those long lines and make code more
> > readable and shorter, right?
> >
> > Can I still keep the helpers?
> >
>
> Sure, unless someone else objects.
Thank you very much for your advice!
-Fenghua
On Mon, Jun 17, 2019 at 1:36 PM Fenghua Yu <[email protected]> wrote:
>
> On Mon, Jun 10, 2019 at 06:41:31AM -0700, Andy Lutomirski wrote:
> >
> >
> > > On Jun 9, 2019, at 11:02 PM, Fenghua Yu <[email protected]> wrote:
> > >
> > >> On Sun, Jun 09, 2019 at 09:24:18PM -0700, Andy Lutomirski wrote:
> > >>> On Sun, Jun 9, 2019 at 9:02 PM Fenghua Yu <[email protected]> wrote:
> > >>>
> > >>>> On Sat, Jun 08, 2019 at 03:50:32PM -0700, Andy Lutomirski wrote:
> > >>>>> On Fri, Jun 7, 2019 at 3:10 PM Fenghua Yu <[email protected]> wrote:
> > >>>>>
> > >>>>> C0.2 state in umwait and tpause instructions can be enabled or disabled
> > >>>>> on a processor through IA32_UMWAIT_CONTROL MSR register.
> > >>>>>
> > >>>>> By default, C0.2 is enabled and the user wait instructions result in
> > >>>>> lower power consumption with slower wakeup time.
> > >>>>>
> > >>>>> But in real time systems which require faster wakeup time although power
> > >>>>> savings could be smaller, the administrator needs to disable C0.2 and all
> > >>>>> C0.2 requests from user applications revert to C0.1.
> > >>>>>
> > >>>>> A sysfs interface "/sys/devices/system/cpu/umwait_control/enable_c02" is
> > >>>>> created to allow the administrator to control C0.2 state during run time.
> > >>>>
> > >>>> This looks better than the previous version. I think the locking is
> > >>>> still rather confused. You have a mutex that you hold while changing
> > >>>> the value, which is entirely reasonable. But, of the code paths that
> > >>>> write the MSR, only one takes the mutex.
> > >>>>
> > >>>> I think you should consider making a function that just does:
> > >>>>
> > >>>> wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
> > >>>>
> > >>>> and using it in all the places that update the MSR. The only thing
> > >>>> that should need the lock is the sysfs code to avoid accidentally
> > >>>> corrupting the value, but that code should also use WRITE_ONCE to do
> > >>>> its update.
> > >>>
> > >>> Based on the comment, the illustrative CPU online and enable_c02 store
> > >>> functions would be:
> > >>>
> > >>> umwait_cpu_online()
> > >>> {
> > >>> wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
> > >>> return 0;
> > >>> }
> > >>>
> > >>> enable_c02_store()
> > >>> {
> > >>> mutex_lock(&umwait_lock);
> > >>> umwait_control_c02 = (u32)!c02_enabled;
> > >>> WRITE_ONCE(umwait_control_cached, 2 | get_umwait_control_max_time());
> > >>> on_each_cpu(umwait_control_msr_update, NULL, 1);
> > >>> mutex_unlock(&umwait_lock);
> > >>> }
> > >>>
> > >>> Then suppose umwait_control_cached = 100000 initially and only CPU0 is
> > >>> running. Admin change bit 0 in MSR from 0 to 1 to disable C0.2 and is
> > >>> onlining CPU1 in the same time:
> > >>>
> > >>> 1. On CPU1, read umwait_control_cached to eax as 100000 in
> > >>> umwait_cpu_online()
> > >>> 2. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
> > >>> 3. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
> > >>> 4. On CPU0, wrmsr with 100001 in enabled_c02_store()
> > >>>
> > >>> The result is CPU0 and CPU1 have different MSR values.
> > >>
> > >> Yes, but only transiently, because you didn't finish your example.
> > >>
> > >> Step 5: enable_c02_store() does on_each_cpu(), and CPU 1 gets updated.
> > >
> > > There is no sync on wrmsr on CPU0 and CPU1.
> >
> > What do you mean by sync?
> >
> > > So a better sequence to
> > > describe the problem is changing the order of wrmsr:
> > >
> > > 1. On CPU1, read umwait_control_cached to eax as 100000 in
> > > umwait_cpu_online()
> > > 2. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
> > > 3. On CPU0, wrmsr with 100001 in on_each_cpu() in enabled_c02_store()
> > > 4. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
> > >
> > > So CPU1 and CPU0 have different MSR values. This won't be transient.
> >
> > You are still ignoring the wrmsr on CPU1 due to on_each_cpu().
> >
>
> Initially umwait_control_cached is 100000 and CPU0 is online while CPU1
> is going to be online:
>
> 1. On CPU1, cpu_online_mask=0x3 in start_secondary()
> 2. On CPU1, read umwait_control_cached to eax as 100000 in umwait_cpu_online()
> 3. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
> 4. On CPU0, execute one_each_cpu() in enabled_c02_store():
> wrmsr with 100001 on CPU0
> wrmsr with 100001 on CPU1
> 5. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
>
> So the MSR is 100000 on CPU1 and 100001 on CPU0. The MSRs are different on
> the CPUs.
>
> Is this a right sequence to demonstrate locking issue without the mutex
> locking?
>
Fair enough. I would fix it differently, though:
static void update_this_cpu_umwait_msr(void)
{
WARN_ON_ONCE(!irqs_disabled()); /* or local_irq_save() */
/* We need to prevent umwait_control from being changed *and*
completing its WRMSR between our read and our WRMSR. By turning IRQs
off here, we ensure that no sysfs write happens on this CPU and we
also make sure that any concurrent sysfs write from a different CPU
will not finish updating us via IPI until we're done. */
wrmsrl(MSR_..., READ_ONCE(umwait_control), 0);
}
On Mon, Jun 17, 2019 at 04:02:50PM -0700, Andy Lutomirski wrote:
> On Mon, Jun 17, 2019 at 1:36 PM Fenghua Yu <[email protected]> wrote:
> >
> > On Mon, Jun 10, 2019 at 06:41:31AM -0700, Andy Lutomirski wrote:
> > >
> > >
> > > > On Jun 9, 2019, at 11:02 PM, Fenghua Yu <[email protected]> wrote:
> > > >
> > > >> On Sun, Jun 09, 2019 at 09:24:18PM -0700, Andy Lutomirski wrote:
> > > >>> On Sun, Jun 9, 2019 at 9:02 PM Fenghua Yu <[email protected]> wrote:
> > > >>>
> > > >>>> On Sat, Jun 08, 2019 at 03:50:32PM -0700, Andy Lutomirski wrote:
> > > >>>>> On Fri, Jun 7, 2019 at 3:10 PM Fenghua Yu <[email protected]> wrote:
> > > >>>>>
> > > >>>>> C0.2 state in umwait and tpause instructions can be enabled or disabled
> > > >>>>> on a processor through IA32_UMWAIT_CONTROL MSR register.
> > > >>>>>
> > > >>>>> By default, C0.2 is enabled and the user wait instructions result in
> > > >>>>> lower power consumption with slower wakeup time.
> > > >>>>>
> > > >>>>> But in real time systems which require faster wakeup time although power
> > > >>>>> savings could be smaller, the administrator needs to disable C0.2 and all
> > > >>>>> C0.2 requests from user applications revert to C0.1.
> > > >>>>>
> > > >>>>> A sysfs interface "/sys/devices/system/cpu/umwait_control/enable_c02" is
> > > >>>>> created to allow the administrator to control C0.2 state during run time.
> > > >>>>
> > > >>>> This looks better than the previous version. I think the locking is
> > > >>>> still rather confused. You have a mutex that you hold while changing
> > > >>>> the value, which is entirely reasonable. But, of the code paths that
> > > >>>> write the MSR, only one takes the mutex.
> > > >>>>
> > > >>>> I think you should consider making a function that just does:
> > > >>>>
> > > >>>> wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
> > > >>>>
> > > >>>> and using it in all the places that update the MSR. The only thing
> > > >>>> that should need the lock is the sysfs code to avoid accidentally
> > > >>>> corrupting the value, but that code should also use WRITE_ONCE to do
> > > >>>> its update.
> > > >>>
> > > >>> Based on the comment, the illustrative CPU online and enable_c02 store
> > > >>> functions would be:
> > > >>>
> > > >>> umwait_cpu_online()
> > > >>> {
> > > >>> wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
> > > >>> return 0;
> > > >>> }
> > > >>>
> > > >>> enable_c02_store()
> > > >>> {
> > > >>> mutex_lock(&umwait_lock);
> > > >>> umwait_control_c02 = (u32)!c02_enabled;
> > > >>> WRITE_ONCE(umwait_control_cached, 2 | get_umwait_control_max_time());
> > > >>> on_each_cpu(umwait_control_msr_update, NULL, 1);
> > > >>> mutex_unlock(&umwait_lock);
> > > >>> }
> > > >>>
> > > >>> Then suppose umwait_control_cached = 100000 initially and only CPU0 is
> > > >>> running. Admin change bit 0 in MSR from 0 to 1 to disable C0.2 and is
> > > >>> onlining CPU1 in the same time:
> > > >>>
> > > >>> 1. On CPU1, read umwait_control_cached to eax as 100000 in
> > > >>> umwait_cpu_online()
> > > >>> 2. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
> > > >>> 3. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
> > > >>> 4. On CPU0, wrmsr with 100001 in enabled_c02_store()
> > > >>>
> > > >>> The result is CPU0 and CPU1 have different MSR values.
> > > >>
> > > >> Yes, but only transiently, because you didn't finish your example.
> > > >>
> > > >> Step 5: enable_c02_store() does on_each_cpu(), and CPU 1 gets updated.
> > > >
> > > > There is no sync on wrmsr on CPU0 and CPU1.
> > >
> > > What do you mean by sync?
> > >
> > > > So a better sequence to
> > > > describe the problem is changing the order of wrmsr:
> > > >
> > > > 1. On CPU1, read umwait_control_cached to eax as 100000 in
> > > > umwait_cpu_online()
> > > > 2. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
> > > > 3. On CPU0, wrmsr with 100001 in on_each_cpu() in enabled_c02_store()
> > > > 4. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
> > > >
> > > > So CPU1 and CPU0 have different MSR values. This won't be transient.
> > >
> > > You are still ignoring the wrmsr on CPU1 due to on_each_cpu().
> > >
> >
> > Initially umwait_control_cached is 100000 and CPU0 is online while CPU1
> > is going to be online:
> >
> > 1. On CPU1, cpu_online_mask=0x3 in start_secondary()
> > 2. On CPU1, read umwait_control_cached to eax as 100000 in umwait_cpu_online()
> > 3. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
> > 4. On CPU0, execute one_each_cpu() in enabled_c02_store():
> > wrmsr with 100001 on CPU0
> > wrmsr with 100001 on CPU1
> > 5. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
> >
> > So the MSR is 100000 on CPU1 and 100001 on CPU0. The MSRs are different on
> > the CPUs.
> >
> > Is this a right sequence to demonstrate locking issue without the mutex
> > locking?
> >
>
> Fair enough. I would fix it differently, though:
>
> static void update_this_cpu_umwait_msr(void)
> {
> WARN_ON_ONCE(!irqs_disabled()); /* or local_irq_save() */
>
> /* We need to prevent umwait_control from being changed *and*
> completing its WRMSR between our read and our WRMSR. By turning IRQs
> off here, we ensure that no sysfs write happens on this CPU and we
> also make sure that any concurrent sysfs write from a different CPU
> will not finish updating us via IPI until we're done. */
> wrmsrl(MSR_..., READ_ONCE(umwait_control), 0);
> }
If no other objections, then I will keep the current mutex lock/unlock to
protect wrmsr and the umwait_control_cached variable.
Thanks.
-Fenghua
On Mon, Jun 17, 2019 at 4:20 PM Fenghua Yu <[email protected]> wrote:
>
> On Mon, Jun 17, 2019 at 04:02:50PM -0700, Andy Lutomirski wrote:
> > On Mon, Jun 17, 2019 at 1:36 PM Fenghua Yu <[email protected]> wrote:
> > >
> > > On Mon, Jun 10, 2019 at 06:41:31AM -0700, Andy Lutomirski wrote:
> > > >
> > > >
> > > > > On Jun 9, 2019, at 11:02 PM, Fenghua Yu <[email protected]> wrote:
> > > > >
> > > > >> On Sun, Jun 09, 2019 at 09:24:18PM -0700, Andy Lutomirski wrote:
> > > > >>> On Sun, Jun 9, 2019 at 9:02 PM Fenghua Yu <[email protected]> wrote:
> > > > >>>
> > > > >>>> On Sat, Jun 08, 2019 at 03:50:32PM -0700, Andy Lutomirski wrote:
> > > > >>>>> On Fri, Jun 7, 2019 at 3:10 PM Fenghua Yu <[email protected]> wrote:
> > > > >>>>>
> > > > >>>>> C0.2 state in umwait and tpause instructions can be enabled or disabled
> > > > >>>>> on a processor through IA32_UMWAIT_CONTROL MSR register.
> > > > >>>>>
> > > > >>>>> By default, C0.2 is enabled and the user wait instructions result in
> > > > >>>>> lower power consumption with slower wakeup time.
> > > > >>>>>
> > > > >>>>> But in real time systems which require faster wakeup time although power
> > > > >>>>> savings could be smaller, the administrator needs to disable C0.2 and all
> > > > >>>>> C0.2 requests from user applications revert to C0.1.
> > > > >>>>>
> > > > >>>>> A sysfs interface "/sys/devices/system/cpu/umwait_control/enable_c02" is
> > > > >>>>> created to allow the administrator to control C0.2 state during run time.
> > > > >>>>
> > > > >>>> This looks better than the previous version. I think the locking is
> > > > >>>> still rather confused. You have a mutex that you hold while changing
> > > > >>>> the value, which is entirely reasonable. But, of the code paths that
> > > > >>>> write the MSR, only one takes the mutex.
> > > > >>>>
> > > > >>>> I think you should consider making a function that just does:
> > > > >>>>
> > > > >>>> wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
> > > > >>>>
> > > > >>>> and using it in all the places that update the MSR. The only thing
> > > > >>>> that should need the lock is the sysfs code to avoid accidentally
> > > > >>>> corrupting the value, but that code should also use WRITE_ONCE to do
> > > > >>>> its update.
> > > > >>>
> > > > >>> Based on the comment, the illustrative CPU online and enable_c02 store
> > > > >>> functions would be:
> > > > >>>
> > > > >>> umwait_cpu_online()
> > > > >>> {
> > > > >>> wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
> > > > >>> return 0;
> > > > >>> }
> > > > >>>
> > > > >>> enable_c02_store()
> > > > >>> {
> > > > >>> mutex_lock(&umwait_lock);
> > > > >>> umwait_control_c02 = (u32)!c02_enabled;
> > > > >>> WRITE_ONCE(umwait_control_cached, 2 | get_umwait_control_max_time());
> > > > >>> on_each_cpu(umwait_control_msr_update, NULL, 1);
> > > > >>> mutex_unlock(&umwait_lock);
> > > > >>> }
> > > > >>>
> > > > >>> Then suppose umwait_control_cached = 100000 initially and only CPU0 is
> > > > >>> running. Admin change bit 0 in MSR from 0 to 1 to disable C0.2 and is
> > > > >>> onlining CPU1 in the same time:
> > > > >>>
> > > > >>> 1. On CPU1, read umwait_control_cached to eax as 100000 in
> > > > >>> umwait_cpu_online()
> > > > >>> 2. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
> > > > >>> 3. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
> > > > >>> 4. On CPU0, wrmsr with 100001 in enabled_c02_store()
> > > > >>>
> > > > >>> The result is CPU0 and CPU1 have different MSR values.
> > > > >>
> > > > >> Yes, but only transiently, because you didn't finish your example.
> > > > >>
> > > > >> Step 5: enable_c02_store() does on_each_cpu(), and CPU 1 gets updated.
> > > > >
> > > > > There is no sync on wrmsr on CPU0 and CPU1.
> > > >
> > > > What do you mean by sync?
> > > >
> > > > > So a better sequence to
> > > > > describe the problem is changing the order of wrmsr:
> > > > >
> > > > > 1. On CPU1, read umwait_control_cached to eax as 100000 in
> > > > > umwait_cpu_online()
> > > > > 2. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
> > > > > 3. On CPU0, wrmsr with 100001 in on_each_cpu() in enabled_c02_store()
> > > > > 4. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
> > > > >
> > > > > So CPU1 and CPU0 have different MSR values. This won't be transient.
> > > >
> > > > You are still ignoring the wrmsr on CPU1 due to on_each_cpu().
> > > >
> > >
> > > Initially umwait_control_cached is 100000 and CPU0 is online while CPU1
> > > is going to be online:
> > >
> > > 1. On CPU1, cpu_online_mask=0x3 in start_secondary()
> > > 2. On CPU1, read umwait_control_cached to eax as 100000 in umwait_cpu_online()
> > > 3. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
> > > 4. On CPU0, execute one_each_cpu() in enabled_c02_store():
> > > wrmsr with 100001 on CPU0
> > > wrmsr with 100001 on CPU1
> > > 5. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
> > >
> > > So the MSR is 100000 on CPU1 and 100001 on CPU0. The MSRs are different on
> > > the CPUs.
> > >
> > > Is this a right sequence to demonstrate locking issue without the mutex
> > > locking?
> > >
> >
> > Fair enough. I would fix it differently, though:
> >
> > static void update_this_cpu_umwait_msr(void)
> > {
> > WARN_ON_ONCE(!irqs_disabled()); /* or local_irq_save() */
> >
> > /* We need to prevent umwait_control from being changed *and*
> > completing its WRMSR between our read and our WRMSR. By turning IRQs
> > off here, we ensure that no sysfs write happens on this CPU and we
> > also make sure that any concurrent sysfs write from a different CPU
> > will not finish updating us via IPI until we're done. */
> > wrmsrl(MSR_..., READ_ONCE(umwait_control), 0);
> > }
>
> If no other objections, then I will keep the current mutex lock/unlock to
> protect wrmsr and the umwait_control_cached variable.
>
I don't think that's sufficient. In your current code, you hold the
mutex in some places and not in others, and there's no explanation.
And I think you're relying on the IRQs-off protection in at least one
code path already, so you're not gaining any simplicity. At the very
least, you need to add some extensive comments everywhere if you want
to keep the mutex, but I think it's simpler and clearer if you just
use the same logic everywhere, for example, as I proposed above.
On Mon, Jun 17, 2019 at 04:41:38PM -0700, Andy Lutomirski wrote:
> On Mon, Jun 17, 2019 at 4:20 PM Fenghua Yu <[email protected]> wrote:
> >
> > On Mon, Jun 17, 2019 at 04:02:50PM -0700, Andy Lutomirski wrote:
> > > On Mon, Jun 17, 2019 at 1:36 PM Fenghua Yu <[email protected]> wrote:
> > > >
> > > > On Mon, Jun 10, 2019 at 06:41:31AM -0700, Andy Lutomirski wrote:
> > > > >
> > > > >
> > > > > > On Jun 9, 2019, at 11:02 PM, Fenghua Yu <[email protected]> wrote:
> > > > > >
> > > > > >> On Sun, Jun 09, 2019 at 09:24:18PM -0700, Andy Lutomirski wrote:
> > > > > >>> On Sun, Jun 9, 2019 at 9:02 PM Fenghua Yu <[email protected]> wrote:
> > > > > >>>
> > > > > >>>> On Sat, Jun 08, 2019 at 03:50:32PM -0700, Andy Lutomirski wrote:
> > > > > >>>>> On Fri, Jun 7, 2019 at 3:10 PM Fenghua Yu <[email protected]> wrote:
> > > > > >>>>>
> > > > > >>>>> C0.2 state in umwait and tpause instructions can be enabled or disabled
> > > > > >>>>> on a processor through IA32_UMWAIT_CONTROL MSR register.
> > > > > >>>>>
> > > > > >>>>> By default, C0.2 is enabled and the user wait instructions result in
> > > > > >>>>> lower power consumption with slower wakeup time.
> > > > > >>>>>
> > > > > >>>>> But in real time systems which require faster wakeup time although power
> > > > > >>>>> savings could be smaller, the administrator needs to disable C0.2 and all
> > > > > >>>>> C0.2 requests from user applications revert to C0.1.
> > > > > >>>>>
> > > > > >>>>> A sysfs interface "/sys/devices/system/cpu/umwait_control/enable_c02" is
> > > > > >>>>> created to allow the administrator to control C0.2 state during run time.
> > > > > >>>>
> > > > > >>>> This looks better than the previous version. I think the locking is
> > > > > >>>> still rather confused. You have a mutex that you hold while changing
> > > > > >>>> the value, which is entirely reasonable. But, of the code paths that
> > > > > >>>> write the MSR, only one takes the mutex.
> > > > > >>>>
> > > > > >>>> I think you should consider making a function that just does:
> > > > > >>>>
> > > > > >>>> wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
> > > > > >>>>
> > > > > >>>> and using it in all the places that update the MSR. The only thing
> > > > > >>>> that should need the lock is the sysfs code to avoid accidentally
> > > > > >>>> corrupting the value, but that code should also use WRITE_ONCE to do
> > > > > >>>> its update.
> > > > > >>>
> > > > > >>> Based on the comment, the illustrative CPU online and enable_c02 store
> > > > > >>> functions would be:
> > > > > >>>
> > > > > >>> umwait_cpu_online()
> > > > > >>> {
> > > > > >>> wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
> > > > > >>> return 0;
> > > > > >>> }
> > > > > >>>
> > > > > >>> enable_c02_store()
> > > > > >>> {
> > > > > >>> mutex_lock(&umwait_lock);
> > > > > >>> umwait_control_c02 = (u32)!c02_enabled;
> > > > > >>> WRITE_ONCE(umwait_control_cached, 2 | get_umwait_control_max_time());
> > > > > >>> on_each_cpu(umwait_control_msr_update, NULL, 1);
> > > > > >>> mutex_unlock(&umwait_lock);
> > > > > >>> }
> > > > > >>>
> > > > > >>> Then suppose umwait_control_cached = 100000 initially and only CPU0 is
> > > > > >>> running. Admin change bit 0 in MSR from 0 to 1 to disable C0.2 and is
> > > > > >>> onlining CPU1 in the same time:
> > > > > >>>
> > > > > >>> 1. On CPU1, read umwait_control_cached to eax as 100000 in
> > > > > >>> umwait_cpu_online()
> > > > > >>> 2. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
> > > > > >>> 3. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
> > > > > >>> 4. On CPU0, wrmsr with 100001 in enabled_c02_store()
> > > > > >>>
> > > > > >>> The result is CPU0 and CPU1 have different MSR values.
> > > > > >>
> > > > > >> Yes, but only transiently, because you didn't finish your example.
> > > > > >>
> > > > > >> Step 5: enable_c02_store() does on_each_cpu(), and CPU 1 gets updated.
> > > > > >
> > > > > > There is no sync on wrmsr on CPU0 and CPU1.
> > > > >
> > > > > What do you mean by sync?
> > > > >
> > > > > > So a better sequence to
> > > > > > describe the problem is changing the order of wrmsr:
> > > > > >
> > > > > > 1. On CPU1, read umwait_control_cached to eax as 100000 in
> > > > > > umwait_cpu_online()
> > > > > > 2. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
> > > > > > 3. On CPU0, wrmsr with 100001 in on_each_cpu() in enabled_c02_store()
> > > > > > 4. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
> > > > > >
> > > > > > So CPU1 and CPU0 have different MSR values. This won't be transient.
> > > > >
> > > > > You are still ignoring the wrmsr on CPU1 due to on_each_cpu().
> > > > >
> > > >
> > > > Initially umwait_control_cached is 100000 and CPU0 is online while CPU1
> > > > is going to be online:
> > > >
> > > > 1. On CPU1, cpu_online_mask=0x3 in start_secondary()
> > > > 2. On CPU1, read umwait_control_cached to eax as 100000 in umwait_cpu_online()
> > > > 3. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
> > > > 4. On CPU0, execute one_each_cpu() in enabled_c02_store():
> > > > wrmsr with 100001 on CPU0
> > > > wrmsr with 100001 on CPU1
> > > > 5. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
> > > >
> > > > So the MSR is 100000 on CPU1 and 100001 on CPU0. The MSRs are different on
> > > > the CPUs.
> > > >
> > > > Is this a right sequence to demonstrate locking issue without the mutex
> > > > locking?
> > > >
> > >
> > > Fair enough. I would fix it differently, though:
> > >
> > > static void update_this_cpu_umwait_msr(void)
> > > {
> > > WARN_ON_ONCE(!irqs_disabled()); /* or local_irq_save() */
> > >
> > > /* We need to prevent umwait_control from being changed *and*
> > > completing its WRMSR between our read and our WRMSR. By turning IRQs
> > > off here, we ensure that no sysfs write happens on this CPU and we
> > > also make sure that any concurrent sysfs write from a different CPU
> > > will not finish updating us via IPI until we're done. */
> > > wrmsrl(MSR_..., READ_ONCE(umwait_control), 0);
> > > }
> >
> > If no other objections, then I will keep the current mutex lock/unlock to
> > protect wrmsr and the umwait_control_cached variable.
> >
>
> I don't think that's sufficient. In your current code, you hold the
> mutex in some places and not in others, and there's no explanation.
The mutex is used in sysfs writing and cpu online.
But it's not used in syscore resume because only BP is running syscore
resume.
> And I think you're relying on the IRQs-off protection in at least one
> code path already, so you're not gaining any simplicity.
I don't rely on IRQs-off protection. I only use mutex to protect.
> At the very
> least, you need to add some extensive comments everywhere if you want
> to keep the mutex,
I have comment on why no need for mutex protection in syscore resume. But
I can add more comments on the locking.
> but I think it's simpler and clearer if you just
> use the same logic everywhere, for example, as I proposed above.
But using irqs_disabled() before wrmsr() and READ_ONCE/WRITE_ONCE for
umwait_control_cached alone are not sufficient. The mutex is still needed
to protect sysfs writing, is that right? Without mutex, one_each_cpu()
can write different values on CPUs, right?
If irqs disabling, READ_ONCE/WRITE_ONCE, and mutex are all used to protect,
isn't that more complex than just using mutex?
Thanks.
-Fenghua
On Mon, Jun 17, 2019 at 5:09 PM Fenghua Yu <[email protected]> wrote:
>
> On Mon, Jun 17, 2019 at 04:41:38PM -0700, Andy Lutomirski wrote:
> > On Mon, Jun 17, 2019 at 4:20 PM Fenghua Yu <[email protected]> wrote:
> > >
> > > On Mon, Jun 17, 2019 at 04:02:50PM -0700, Andy Lutomirski wrote:
> > > > On Mon, Jun 17, 2019 at 1:36 PM Fenghua Yu <[email protected]> wrote:
> > > > >
> > > > > On Mon, Jun 10, 2019 at 06:41:31AM -0700, Andy Lutomirski wrote:
> > > > > >
> > > > > >
> > > > > > > On Jun 9, 2019, at 11:02 PM, Fenghua Yu <[email protected]> wrote:
> > > > > > >
> > > > > > >> On Sun, Jun 09, 2019 at 09:24:18PM -0700, Andy Lutomirski wrote:
> > > > > > >>> On Sun, Jun 9, 2019 at 9:02 PM Fenghua Yu <[email protected]> wrote:
> > > > > > >>>
> > > > > > >>>> On Sat, Jun 08, 2019 at 03:50:32PM -0700, Andy Lutomirski wrote:
> > > > > > >>>>> On Fri, Jun 7, 2019 at 3:10 PM Fenghua Yu <[email protected]> wrote:
> > > > > > >>>>>
> > > > > > >>>>> C0.2 state in umwait and tpause instructions can be enabled or disabled
> > > > > > >>>>> on a processor through IA32_UMWAIT_CONTROL MSR register.
> > > > > > >>>>>
> > > > > > >>>>> By default, C0.2 is enabled and the user wait instructions result in
> > > > > > >>>>> lower power consumption with slower wakeup time.
> > > > > > >>>>>
> > > > > > >>>>> But in real time systems which require faster wakeup time although power
> > > > > > >>>>> savings could be smaller, the administrator needs to disable C0.2 and all
> > > > > > >>>>> C0.2 requests from user applications revert to C0.1.
> > > > > > >>>>>
> > > > > > >>>>> A sysfs interface "/sys/devices/system/cpu/umwait_control/enable_c02" is
> > > > > > >>>>> created to allow the administrator to control C0.2 state during run time.
> > > > > > >>>>
> > > > > > >>>> This looks better than the previous version. I think the locking is
> > > > > > >>>> still rather confused. You have a mutex that you hold while changing
> > > > > > >>>> the value, which is entirely reasonable. But, of the code paths that
> > > > > > >>>> write the MSR, only one takes the mutex.
> > > > > > >>>>
> > > > > > >>>> I think you should consider making a function that just does:
> > > > > > >>>>
> > > > > > >>>> wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
> > > > > > >>>>
> > > > > > >>>> and using it in all the places that update the MSR. The only thing
> > > > > > >>>> that should need the lock is the sysfs code to avoid accidentally
> > > > > > >>>> corrupting the value, but that code should also use WRITE_ONCE to do
> > > > > > >>>> its update.
> > > > > > >>>
> > > > > > >>> Based on the comment, the illustrative CPU online and enable_c02 store
> > > > > > >>> functions would be:
> > > > > > >>>
> > > > > > >>> umwait_cpu_online()
> > > > > > >>> {
> > > > > > >>> wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
> > > > > > >>> return 0;
> > > > > > >>> }
> > > > > > >>>
> > > > > > >>> enable_c02_store()
> > > > > > >>> {
> > > > > > >>> mutex_lock(&umwait_lock);
> > > > > > >>> umwait_control_c02 = (u32)!c02_enabled;
> > > > > > >>> WRITE_ONCE(umwait_control_cached, 2 | get_umwait_control_max_time());
> > > > > > >>> on_each_cpu(umwait_control_msr_update, NULL, 1);
> > > > > > >>> mutex_unlock(&umwait_lock);
> > > > > > >>> }
> > > > > > >>>
> > > > > > >>> Then suppose umwait_control_cached = 100000 initially and only CPU0 is
> > > > > > >>> running. Admin change bit 0 in MSR from 0 to 1 to disable C0.2 and is
> > > > > > >>> onlining CPU1 in the same time:
> > > > > > >>>
> > > > > > >>> 1. On CPU1, read umwait_control_cached to eax as 100000 in
> > > > > > >>> umwait_cpu_online()
> > > > > > >>> 2. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
> > > > > > >>> 3. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
> > > > > > >>> 4. On CPU0, wrmsr with 100001 in enabled_c02_store()
> > > > > > >>>
> > > > > > >>> The result is CPU0 and CPU1 have different MSR values.
> > > > > > >>
> > > > > > >> Yes, but only transiently, because you didn't finish your example.
> > > > > > >>
> > > > > > >> Step 5: enable_c02_store() does on_each_cpu(), and CPU 1 gets updated.
> > > > > > >
> > > > > > > There is no sync on wrmsr on CPU0 and CPU1.
> > > > > >
> > > > > > What do you mean by sync?
> > > > > >
> > > > > > > So a better sequence to
> > > > > > > describe the problem is changing the order of wrmsr:
> > > > > > >
> > > > > > > 1. On CPU1, read umwait_control_cached to eax as 100000 in
> > > > > > > umwait_cpu_online()
> > > > > > > 2. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
> > > > > > > 3. On CPU0, wrmsr with 100001 in on_each_cpu() in enabled_c02_store()
> > > > > > > 4. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
> > > > > > >
> > > > > > > So CPU1 and CPU0 have different MSR values. This won't be transient.
> > > > > >
> > > > > > You are still ignoring the wrmsr on CPU1 due to on_each_cpu().
> > > > > >
> > > > >
> > > > > Initially umwait_control_cached is 100000 and CPU0 is online while CPU1
> > > > > is going to be online:
> > > > >
> > > > > 1. On CPU1, cpu_online_mask=0x3 in start_secondary()
> > > > > 2. On CPU1, read umwait_control_cached to eax as 100000 in umwait_cpu_online()
> > > > > 3. On CPU0, write 100001 to umwait_control_cached in enable_c02_store()
> > > > > 4. On CPU0, execute one_each_cpu() in enabled_c02_store():
> > > > > wrmsr with 100001 on CPU0
> > > > > wrmsr with 100001 on CPU1
> > > > > 5. On CPU1, wrmsr with eax=100000 in umwaint_cpu_online()
> > > > >
> > > > > So the MSR is 100000 on CPU1 and 100001 on CPU0. The MSRs are different on
> > > > > the CPUs.
> > > > >
> > > > > Is this a right sequence to demonstrate locking issue without the mutex
> > > > > locking?
> > > > >
> > > >
> > > > Fair enough. I would fix it differently, though:
> > > >
> > > > static void update_this_cpu_umwait_msr(void)
> > > > {
> > > > WARN_ON_ONCE(!irqs_disabled()); /* or local_irq_save() */
> > > >
> > > > /* We need to prevent umwait_control from being changed *and*
> > > > completing its WRMSR between our read and our WRMSR. By turning IRQs
> > > > off here, we ensure that no sysfs write happens on this CPU and we
> > > > also make sure that any concurrent sysfs write from a different CPU
> > > > will not finish updating us via IPI until we're done. */
> > > > wrmsrl(MSR_..., READ_ONCE(umwait_control), 0);
> > > > }
> > >
> > > If no other objections, then I will keep the current mutex lock/unlock to
> > > protect wrmsr and the umwait_control_cached variable.
> > >
> >
> > I don't think that's sufficient. In your current code, you hold the
> > mutex in some places and not in others, and there's no explanation.
>
> The mutex is used in sysfs writing and cpu online.
>
> But it's not used in syscore resume because only BP is running syscore
> resume.
>
> > And I think you're relying on the IRQs-off protection in at least one
> > code path already, so you're not gaining any simplicity.
>
> I don't rely on IRQs-off protection. I only use mutex to protect.
You're relying on being single-threaded in umwait_syscore_resume().
Do you actually know that's safe? You say it's because you're single
threaded, but what if you were suspended in the middle of a sysfs
operation? I think it's fine, but it needs an argument along the
lines of the argument for why the irqs disabled case is okay.
>
> > At the very
> > least, you need to add some extensive comments everywhere if you want
> > to keep the mutex,
>
> I have comment on why no need for mutex protection in syscore resume. But
> I can add more comments on the locking.
>
> > but I think it's simpler and clearer if you just
> > use the same logic everywhere, for example, as I proposed above.
>
> But using irqs_disabled() before wrmsr() and READ_ONCE/WRITE_ONCE for
> umwait_control_cached alone are not sufficient. The mutex is still needed
> to protect sysfs writing, is that right? Without mutex, one_each_cpu()
> can write different values on CPUs, right?
Yes, you probably need a mutex to prevent two sysfs writers from
clobbering each other.
>
> If irqs disabling, READ_ONCE/WRITE_ONCE, and mutex are all used to protect,
> isn't that more complex than just using mutex?
But you're already using a mutex and a comment. And you're hoping
that the syscore resume callback reads something sensible despite the
lack of READ_ONCE / WRITE_ONCE. The compiler is unlikely to butcher
this too badly, but still.
--Andy
On Mon, Jun 17, 2019 at 05:19:02PM -0700, Andy Lutomirski wrote:
> On Mon, Jun 17, 2019 at 5:09 PM Fenghua Yu <[email protected]> wrote:
> But you're already using a mutex and a comment. And you're hoping
> that the syscore resume callback reads something sensible despite the
> lack of READ_ONCE / WRITE_ONCE. The compiler is unlikely to butcher
> this too badly, but still.
You are right, syscore_resume will be wrong if suspend in middle of
sysfs writing.
Ok. I change this patch based on your proposed locking. Is this patch
right? Should I use WRITE_ONCE/READ_ONCE on each access of
umwait_control_cached?
Thanks.
-Fenghua
diff --git a/arch/x86/power/umwait.c b/arch/x86/power/umwait.c
index 9594af9f657e..d17572605c1a 100644
--- a/arch/x86/power/umwait.c
+++ b/arch/x86/power/umwait.c
@@ -11,10 +11,34 @@
*/
static u32 umwait_control_cached = 100000 & ~MSR_IA32_UMWAIT_CONTROL_C02_DISABLED;
+/*
+ * Serialize access to umwait_control_cached and IA32_UMWAIT_CONTROL MSR
+ * in writing sysfs to ensure all CPUs have the same MSR value.
+ */
+static DEFINE_MUTEX(umwait_lock);
+
+static void update_this_cpu_umwait_control_msr(void)
+{
+ unsigned long flags;
+
+ /*
+ * We need to prevent umwait_control_cached from being changed *and*
+ * completing its WRMSR between our read and our WRMSR. By turning
+ * IRQs off here, ensure that no sysfs write happens on this CPU
+ * and we also make sure that any concurrent sysfs write from a
+ * different CPU will not finish updating us via IPI until we're done.
+ */
+ local_irq_save(flags);
+
+ wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
+
+ local_irq_restore(flags);
+}
+
/* Set up IA32_UMWAIT_CONTROL MSR on CPU using the current global setting. */
static int umwait_cpu_online(unsigned int cpu)
{
- wrmsr(MSR_IA32_UMWAIT_CONTROL, umwait_control_cached, 0);
+ update_this_cpu_umwait_control_msr();
return 0;
}
@@ -30,24 +54,102 @@ static int umwait_cpu_online(unsigned int cpu)
*/
static void umwait_syscore_resume(void)
{
- wrmsr(MSR_IA32_UMWAIT_CONTROL, umwait_control_cached, 0);
+ update_this_cpu_umwait_control_msr();
}
static struct syscore_ops umwait_syscore_ops = {
.resume = umwait_syscore_resume,
};
+static void umwait_control_msr_update(void *unused)
+{
+ update_this_cpu_umwait_control_msr();
+}
+
+static u32 get_umwait_control_c02(void)
+{
+ return READ_ONCE(umwait_control_cached) & MSR_IA32_UMWAIT_CONTROL_C02_DISABLED;
+}
+
+static u32 get_umwait_control_max_time(void)
+{
+ return READ_ONCE(umwait_control_cached) & MSR_IA32_UMWAIT_CONTROL_MAX_TIME;
+}
+
+static ssize_t
+enable_c02_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ /*
+ * When bit 0 in IA32_UMWAIT_CONTROL MSR is 1, C0.2 is disabled.
+ * Otherwise, C0.2 is enabled. Show the opposite of bit 0.
+ */
+ return sprintf(buf, "%d\n", !(bool)get_umwait_control_c02());
+}
+
+static ssize_t enable_c02_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ u32 umwait_control_c02;
+ bool c02_enabled;
+ int ret;
+
+ ret = kstrtobool(buf, &c02_enabled);
+ if (ret)
+ return ret;
+
+ mutex_lock(&umwait_lock);
+
+ /*
+ * The value of bit 0 in IA32_UMWAIT_CONTROL MSR is opposite of
+ * c02_enabled.
+ */
+ umwait_control_c02 = (u32)!c02_enabled;
+ if (umwait_control_c02 == get_umwait_control_c02())
+ goto out_unlock;
+
+ WRITE_ONCE(umwait_control_cached, umwait_control_c02 | get_umwait_control_max_time());
+ /* Enable/disable C0.2 state on all CPUs */
+ on_each_cpu(umwait_control_msr_update, NULL, 1);
+
+out_unlock:
+ mutex_unlock(&umwait_lock);
+
+ return count;
+}
+static DEVICE_ATTR_RW(enable_c02);
+
+static struct attribute *umwait_attrs[] = {
+ &dev_attr_enable_c02.attr,
+ NULL
+};
+
+static struct attribute_group umwait_attr_group = {
+ .attrs = umwait_attrs,
+ .name = "umwait_control",
+};
+
static int __init umwait_init(void)
{
+ struct device *dev;
int ret;
if (!boot_cpu_has(X86_FEATURE_WAITPKG))
return -ENODEV;
+ /* Add umwait control interface. */
+ dev = cpu_subsys.dev_root;
+ ret = sysfs_create_group(&dev->kobj, &umwait_attr_group);
+ if (ret)
+ return ret;
+
ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait/intel:online",
umwait_cpu_online, NULL);
- if (ret < 0)
+ if (ret < 0) {
+ sysfs_remove_group(&dev->kobj, &umwait_attr_group);
+
return ret;
+ }
register_syscore_ops(&umwait_syscore_ops);
--
2.19.1