2018-03-23 22:00:30

by Eric Dumazet

[permalink] [raw]
Subject: [PATCH v3 1/2] x86, msr: allow rdmsr_safe_on_cpu() to schedule

I noticed high latencies caused by a daemon periodically reading
various MSR on all cpus. KASAN kernels would see ~10ms latencies
simply reading one MSR. Even without KASAN, sending IPI to CPU
in deep sleep state or blocking hard IRQ in a a long section,
then waiting for the answer can consume hundreds of usec.

Converts rdmsr_safe_on_cpu() to use a completion instead
of busy polling.

Overall daemon cpu usage was reduced by 35 %,
and latencies caused by msr_read() disappeared.

Signed-off-by: Eric Dumazet <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Hugh Dickins <[email protected]>
---
arch/x86/lib/msr-smp.c | 32 ++++++++++++++++++++++++--------
1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c
index 693cce0be82dffb822cecd0c7e38d2821aff896c..761ba062afdaf7f7d0603ed94ed6cc3e46b37f76 100644
--- a/arch/x86/lib/msr-smp.c
+++ b/arch/x86/lib/msr-smp.c
@@ -2,6 +2,7 @@
#include <linux/export.h>
#include <linux/preempt.h>
#include <linux/smp.h>
+#include <linux/completion.h>
#include <asm/msr.h>

static void __rdmsr_on_cpu(void *info)
@@ -143,13 +144,19 @@ void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
}
EXPORT_SYMBOL(wrmsr_on_cpus);

+struct msr_info_completion {
+ struct msr_info msr;
+ struct completion done;
+};
+
/* These "safe" variants are slower and should be used when the target MSR
may not actually exist. */
static void __rdmsr_safe_on_cpu(void *info)
{
- struct msr_info *rv = info;
+ struct msr_info_completion *rv = info;

- rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h);
+ rv->msr.err = rdmsr_safe(rv->msr.msr_no, &rv->msr.reg.l, &rv->msr.reg.h);
+ complete(&rv->done);
}

static void __wrmsr_safe_on_cpu(void *info)
@@ -161,17 +168,26 @@ static void __wrmsr_safe_on_cpu(void *info)

int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
{
+ struct msr_info_completion rv;
+ call_single_data_t csd = {
+ .func = __rdmsr_safe_on_cpu,
+ .info = &rv,
+ };
int err;
- struct msr_info rv;

memset(&rv, 0, sizeof(rv));
+ init_completion(&rv.done);
+ rv.msr.msr_no = msr_no;

- rv.msr_no = msr_no;
- err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
- *l = rv.reg.l;
- *h = rv.reg.h;
+ err = smp_call_function_single_async(cpu, &csd);
+ if (!err) {
+ wait_for_completion(&rv.done);
+ err = rv.msr.err;
+ }
+ *l = rv.msr.reg.l;
+ *h = rv.msr.reg.h;

- return err ? err : rv.err;
+ return err;
}
EXPORT_SYMBOL(rdmsr_safe_on_cpu);

--
2.17.0.rc0.231.g781580f067-goog



2018-03-23 22:00:37

by Eric Dumazet

[permalink] [raw]
Subject: [PATCH v3 2/2] x86, cpuid: allow cpuid_read() to schedule

I noticed high latencies caused by a daemon periodically reading various
MSR and cpuid on all cpus. KASAN kernels would see ~10ms latencies
simply reading one cpuid. Even without KASAN, sending IPI to CPU
in deep sleep state or blocking hard IRQ in a a long section,
then waiting for the answer can consume hundreds of usec or more.

Switching to smp_call_function_single_async() and a completion
allows to reschedule and not burn cpu cycles.

Signed-off-by: Eric Dumazet <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Hugh Dickins <[email protected]>
---
arch/x86/kernel/cpuid.c | 34 ++++++++++++++++++++++++++--------
1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 0931a105ffe16cde4640e759efa600b23a756d84..1d300f96df4b316dbe3392c8221467cfd8593272 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -40,6 +40,7 @@
#include <linux/notifier.h>
#include <linux/uaccess.h>
#include <linux/gfp.h>
+#include <linux/completion.h>

#include <asm/processor.h>
#include <asm/msr.h>
@@ -47,19 +48,27 @@
static struct class *cpuid_class;
static enum cpuhp_state cpuhp_cpuid_state;

+struct cpuid_regs_done {
+ struct cpuid_regs regs;
+ struct completion done;
+};
+
static void cpuid_smp_cpuid(void *cmd_block)
{
- struct cpuid_regs *cmd = (struct cpuid_regs *)cmd_block;
+ struct cpuid_regs_done *cmd = cmd_block;
+
+ cpuid_count(cmd->regs.eax, cmd->regs.ecx,
+ &cmd->regs.eax, &cmd->regs.ebx,
+ &cmd->regs.ecx, &cmd->regs.edx);

- cpuid_count(cmd->eax, cmd->ecx,
- &cmd->eax, &cmd->ebx, &cmd->ecx, &cmd->edx);
+ complete(&cmd->done);
}

static ssize_t cpuid_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
char __user *tmp = buf;
- struct cpuid_regs cmd;
+ struct cpuid_regs_done cmd;
int cpu = iminor(file_inode(file));
u64 pos = *ppos;
ssize_t bytes = 0;
@@ -68,19 +77,28 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
if (count % 16)
return -EINVAL; /* Invalid chunk size */

+ init_completion(&cmd.done);
for (; count; count -= 16) {
- cmd.eax = pos;
- cmd.ecx = pos >> 32;
- err = smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1);
+ call_single_data_t csd = {
+ .func = cpuid_smp_cpuid,
+ .info = &cmd,
+ };
+
+ cmd.regs.eax = pos;
+ cmd.regs.ecx = pos >> 32;
+
+ err = smp_call_function_single_async(cpu, &csd);
if (err)
break;
- if (copy_to_user(tmp, &cmd, 16)) {
+ wait_for_completion(&cmd.done);
+ if (copy_to_user(tmp, &cmd.regs, 16)) {
err = -EFAULT;
break;
}
tmp += 16;
bytes += 16;
*ppos = ++pos;
+ reinit_completion(&cmd.done);
}

return bytes ? bytes : err;
--
2.17.0.rc0.231.g781580f067-goog


2018-03-23 22:19:30

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH v3 2/2] x86, cpuid: allow cpuid_read() to schedule

On 03/23/18 14:58, Eric Dumazet wrote:
> I noticed high latencies caused by a daemon periodically reading various
> MSR and cpuid on all cpus. KASAN kernels would see ~10ms latencies
> simply reading one cpuid. Even without KASAN, sending IPI to CPU
> in deep sleep state or blocking hard IRQ in a a long section,
> then waiting for the answer can consume hundreds of usec or more.
>
> Switching to smp_call_function_single_async() and a completion
> allows to reschedule and not burn cpu cycles.

That being said, the Right Way for a daemon to read multiple MSRs and
CPUIDs on multiple CPUs is to spawn a thread for each CPU and use CPU
affinity to lock them down. No IPI is needed to access MSRs on the
current CPU, and CPUID doesn't even need kernel entry.

-hpa

2018-03-24 01:02:23

by Eric Dumazet

[permalink] [raw]
Subject: Re: [PATCH v3 2/2] x86, cpuid: allow cpuid_read() to schedule



On 03/23/2018 03:17 PM, H. Peter Anvin wrote:
> On 03/23/18 14:58, Eric Dumazet wrote:
>> I noticed high latencies caused by a daemon periodically reading various
>> MSR and cpuid on all cpus. KASAN kernels would see ~10ms latencies
>> simply reading one cpuid. Even without KASAN, sending IPI to CPU
>> in deep sleep state or blocking hard IRQ in a a long section,
>> then waiting for the answer can consume hundreds of usec or more.
>>
>> Switching to smp_call_function_single_async() and a completion
>> allows to reschedule and not burn cpu cycles.
>
> That being said, the Right Way for a daemon to read multiple MSRs and
> CPUIDs on multiple CPUs is to spawn a thread for each CPU and use CPU
> affinity to lock them down. No IPI is needed to access MSRs on the
> current CPU, and CPUID doesn't even need kernel entry.

Indeed, assuming a daemon can have threads running on all cpus :/

Some environments like to partition cpus for different jobs/containers.

Yes, we can avoid IPI by carefully re-designing these user programs.

2018-03-24 08:11:56

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] x86, msr: allow rdmsr_safe_on_cpu() to schedule


* Eric Dumazet <[email protected]> wrote:

> I noticed high latencies caused by a daemon periodically reading
> various MSR on all cpus. KASAN kernels would see ~10ms latencies
> simply reading one MSR. Even without KASAN, sending IPI to CPU
> in deep sleep state or blocking hard IRQ in a a long section,
> then waiting for the answer can consume hundreds of usec.
>
> Converts rdmsr_safe_on_cpu() to use a completion instead
> of busy polling.
>
> Overall daemon cpu usage was reduced by 35 %,
> and latencies caused by msr_read() disappeared.

What "daemon" is this and why is it reading MSRs?

Thanks,

Ingo

2018-03-24 10:53:41

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] x86, msr: allow rdmsr_safe_on_cpu() to schedule

On Sat, 24 Mar 2018, Ingo Molnar wrote:
> * Eric Dumazet <[email protected]> wrote:
>
> > I noticed high latencies caused by a daemon periodically reading
> > various MSR on all cpus. KASAN kernels would see ~10ms latencies
> > simply reading one MSR. Even without KASAN, sending IPI to CPU
> > in deep sleep state or blocking hard IRQ in a a long section,
> > then waiting for the answer can consume hundreds of usec.
> >
> > Converts rdmsr_safe_on_cpu() to use a completion instead
> > of busy polling.
> >
> > Overall daemon cpu usage was reduced by 35 %,
> > and latencies caused by msr_read() disappeared.
>
> What "daemon" is this and why is it reading MSRs?

Why? Just because it can ....

2018-03-24 14:31:00

by Eric Dumazet

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] x86, msr: allow rdmsr_safe_on_cpu() to schedule



On 03/24/2018 01:09 AM, Ingo Molnar wrote:
>
> * Eric Dumazet <[email protected]> wrote:
>
>> I noticed high latencies caused by a daemon periodically reading
>> various MSR on all cpus. KASAN kernels would see ~10ms latencies
>> simply reading one MSR. Even without KASAN, sending IPI to CPU
>> in deep sleep state or blocking hard IRQ in a a long section,
>> then waiting for the answer can consume hundreds of usec.
>>
>> Converts rdmsr_safe_on_cpu() to use a completion instead
>> of busy polling.
>>
>> Overall daemon cpu usage was reduced by 35 %,
>> and latencies caused by msr_read() disappeared.
>
> What "daemon" is this and why is it reading MSRs?

It is named gsysd, "Google System Tool", a daemon+cli that is run
on all machines in production to provide a generic interface
for interacting with the system hardware.

I am not sure if this answers your question, I probably
could give a rough estimation of MWh this daemon consumes on the planet
if that helps.

Note that the source of the problem is not reading the MSR, but having cpus
blocking hard irqs for a long time.

Ingo, it looks like any loop protected by unlock_task_sighand() might be the main
offender.

Application writers seem to love getrusage() for example.
Can we rewrite it to not block hard irqs ?

Thanks !

2018-03-25 14:15:58

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] x86, msr: allow rdmsr_safe_on_cpu() to schedule

On Sat, Mar 24, 2018 at 07:29:48AM -0700, Eric Dumazet wrote:
> It is named gsysd, "Google System Tool", a daemon+cli that is run
> on all machines in production to provide a generic interface
> for interacting with the system hardware.

So I'm wondering if poking at the hardware like that is a really optimal
design. Maybe it would be cleaner if the OS would provide properly
abstracted sysfs interfaces instead of raw MSRs. For a couple of
reasons:

* different vendors have different MSR ranges giving the same info so
instead of differentiating that in your daemon, we can do that nicely in
the kernel.

* exposing raw MSRs instead of having clearly defined sysfs files is
always a pain when a new CPU decides to change those MSRs. Hiding that
change in the OS is always easier.

* periodically polling MSRs which don't change that often is, of course,
wasting power and so reading a cached result is leaner.

* <another reason which I'll think of after hitting send... :\ >

In general, we should've never have had exposed that raw MSR access but
it is too late now - that ship has sailed. We can still try to design
new interfaces more cleanly, though.

--
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

2018-03-25 22:15:19

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH v3 2/2] x86, cpuid: allow cpuid_read() to schedule

On 03/23/18 18:01, Eric Dumazet wrote:
>
> Indeed, assuming a daemon can have threads running on all cpus :/
>
> Some environments like to partition cpus for different jobs/containers.
>

Then it doesn't need to run it on all cpus... just the one that that
particular daemon is "responsible" for.

-hpa

2018-03-26 01:23:22

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] x86, msr: allow rdmsr_safe_on_cpu() to schedule

On 03/25/18 07:12, Borislav Petkov wrote:
>
> So I'm wondering if poking at the hardware like that is a really optimal
> design. Maybe it would be cleaner if the OS would provide properly
> abstracted sysfs interfaces instead of raw MSRs. For a couple of
> reasons:
>

It's most definitely not.

-hpa

2018-03-26 06:42:02

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] x86, msr: allow rdmsr_safe_on_cpu() to schedule


* Borislav Petkov <[email protected]> wrote:

> On Sat, Mar 24, 2018 at 07:29:48AM -0700, Eric Dumazet wrote:
> > It is named gsysd, "Google System Tool", a daemon+cli that is run
> > on all machines in production to provide a generic interface
> > for interacting with the system hardware.
>
> So I'm wondering if poking at the hardware like that is a really optimal
> design. Maybe it would be cleaner if the OS would provide properly
> abstracted sysfs interfaces instead of raw MSRs.

It's not ideal to read /dev/msr.

A SysFS interface to enumerate 'system statistics' MSRs already exists via perf,
under:

/sys/bus/event_source/devices/msr/events/

This is for simple platform specific hardware statistics counters.

This is implemented in arch/x86/events/msr.c, with a number of MSRs already
abstracted out:

enum perf_msr_id {
PERF_MSR_TSC = 0,
PERF_MSR_APERF = 1,
PERF_MSR_MPERF = 2,
PERF_MSR_PPERF = 3,
PERF_MSR_SMI = 4,
PERF_MSR_PTSC = 5,
PERF_MSR_IRPERF = 6,
PERF_MSR_THERM = 7,
PERF_MSR_THERM_SNAP = 8,
PERF_MSR_THERM_UNIT = 9,
PERF_MSR_EVENT_MAX,
};

It's very easy to add new MSRs:

9ae21dd66b97: perf/x86/msr: Add support for MSR_IA32_THERM_STATUS
aaf248848db5: perf/x86/msr: Add AMD IRPERF (Instructions Retired) performance counter
8a2242618477: perf/x86/msr: Add AMD PTSC (Performance Time-Stamp Counter) support

This commit added an MSR to msr-index and added MSR event support for it as well
with proper CPU-ID dependent enumeration:

8a2242618477: perf/x86/msr: Add AMD PTSC (Performance Time-Stamp Counter) support

arch/x86/events/msr.c | 8 ++++++++
arch/x86/include/asm/cpufeatures.h | 1 +
arch/x86/include/asm/msr-index.h | 1 +
3 files changed, 10 insertions(+)

More complex MSR value encodings are supported as well - see for example
MSR_IA32_THERM_STATUS, which is encoded as:

/* if valid, extract digital readout, other set to -1 */
now = now & (1ULL << 31) ? (now >> 16) & 0x3f : -1;
local64_set(&event->count, now);

If an MSR counter is a plain integer counter then no such code has to be added.

Only those MSRs that are valid on a system are 'live', and thus tooling can
enumerate them programmatically and has easy symbolic access to them:

galatea:~> perf list | grep msr/

msr/aperf/ [Kernel PMU event]
msr/cpu_thermal_margin/ [Kernel PMU event]
msr/mperf/ [Kernel PMU event]
msr/smi/ [Kernel PMU event]
msr/tsc/ [Kernel PMU event]

These can be used as simple counters that are read - and it's using perf not
/dev/msr so they are fast and scalable - but the full set of perf features is also
available, so we can do:

galatea:~> perf stat -a -e msr/aperf/ sleep 1

Performance counter stats for 'system wide':

354,442,500 msr/aperf/

1.001918252 seconds time elapsed

This interface is vastly superior to /dev/msr: it does not have the various
usability, scalability and security limitations of /dev/msr.

/dev/msr is basically for debugging.

If performance matters then the relevant MSR events should be added and gsysd
should be updated to support MSR events.

Thanks,

Ingo

2018-03-26 14:24:30

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] x86, msr: allow rdmsr_safe_on_cpu() to schedule

On Mon, 26 Mar 2018, Eric Dumazet wrote:
> On Sun, Mar 25, 2018 at 11:40 PM Ingo Molnar <[email protected]> wrote:
> > If performance matters then the relevant MSR events should be added and
> > gsysd
> > should be updated to support MSR events.
> >
>
> Thanks for the hints Ingo. I have forwarded them.
>
> I am pretty sure gsysd was designed and written years before perf even
> existed,
> so I am not sure all this will ever be implemented.
>
> I guess I will apply the kernel patches locally for the time being.

I'm still considering to merge them because there is no real reason not to
do so. The interfaces exist and are used. Reducing their impact is
certainly worthwhile.

Thanks,

tglx

2018-03-27 09:38:17

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] x86, msr: allow rdmsr_safe_on_cpu() to schedule


* Thomas Gleixner <[email protected]> wrote:

> On Mon, 26 Mar 2018, Eric Dumazet wrote:
> > On Sun, Mar 25, 2018 at 11:40 PM Ingo Molnar <[email protected]> wrote:
> > > If performance matters then the relevant MSR events should be added and
> > > gsysd
> > > should be updated to support MSR events.
> > >
> >
> > Thanks for the hints Ingo. I have forwarded them.
> >
> > I am pretty sure gsysd was designed and written years before perf even
> > existed,
> > so I am not sure all this will ever be implemented.
> >
> > I guess I will apply the kernel patches locally for the time being.
>
> I'm still considering to merge them because there is no real reason not to
> do so. The interfaces exist and are used. Reducing their impact is
> certainly worthwhile.

Ok, that's a good point, and the patches themselves are sane:

Acked-by: Ingo Molnar <[email protected]>

Thanks,

Ingo

Subject: [tip:x86/cleanups] x86/msr: Allow rdmsr_safe_on_cpu() to schedule

Commit-ID: 07cde313b2d21f728cec2836db7cdb55476f7a26
Gitweb: https://git.kernel.org/tip/07cde313b2d21f728cec2836db7cdb55476f7a26
Author: Eric Dumazet <[email protected]>
AuthorDate: Fri, 23 Mar 2018 14:58:17 -0700
Committer: Thomas Gleixner <[email protected]>
CommitDate: Tue, 27 Mar 2018 12:01:47 +0200

x86/msr: Allow rdmsr_safe_on_cpu() to schedule

High latencies can be observed caused by a daemon periodically reading
various MSR on all cpus. On KASAN enabled kernels ~10ms latencies can be
observed simply reading one MSR. Even without KASAN, sending an IPI to a
CPU, which is in a deep sleep state or in a long hard IRQ disabled section,
waiting for the answer can consume hundreds of microseconds.

All usage sites are in preemptible context, convert rdmsr_safe_on_cpu() to
use a completion instead of busy polling.

Overall daemon cpu usage was reduced by 35 %, and latencies caused by
msr_read() disappeared.

Signed-off-by: Eric Dumazet <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Acked-by: Ingo Molnar <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Eric Dumazet <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]

---
arch/x86/lib/msr-smp.c | 32 ++++++++++++++++++++++++--------
1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c
index 693cce0be82d..761ba062afda 100644
--- a/arch/x86/lib/msr-smp.c
+++ b/arch/x86/lib/msr-smp.c
@@ -2,6 +2,7 @@
#include <linux/export.h>
#include <linux/preempt.h>
#include <linux/smp.h>
+#include <linux/completion.h>
#include <asm/msr.h>

static void __rdmsr_on_cpu(void *info)
@@ -143,13 +144,19 @@ void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
}
EXPORT_SYMBOL(wrmsr_on_cpus);

+struct msr_info_completion {
+ struct msr_info msr;
+ struct completion done;
+};
+
/* These "safe" variants are slower and should be used when the target MSR
may not actually exist. */
static void __rdmsr_safe_on_cpu(void *info)
{
- struct msr_info *rv = info;
+ struct msr_info_completion *rv = info;

- rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h);
+ rv->msr.err = rdmsr_safe(rv->msr.msr_no, &rv->msr.reg.l, &rv->msr.reg.h);
+ complete(&rv->done);
}

static void __wrmsr_safe_on_cpu(void *info)
@@ -161,17 +168,26 @@ static void __wrmsr_safe_on_cpu(void *info)

int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
{
+ struct msr_info_completion rv;
+ call_single_data_t csd = {
+ .func = __rdmsr_safe_on_cpu,
+ .info = &rv,
+ };
int err;
- struct msr_info rv;

memset(&rv, 0, sizeof(rv));
+ init_completion(&rv.done);
+ rv.msr.msr_no = msr_no;

- rv.msr_no = msr_no;
- err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
- *l = rv.reg.l;
- *h = rv.reg.h;
+ err = smp_call_function_single_async(cpu, &csd);
+ if (!err) {
+ wait_for_completion(&rv.done);
+ err = rv.msr.err;
+ }
+ *l = rv.msr.reg.l;
+ *h = rv.msr.reg.h;

- return err ? err : rv.err;
+ return err;
}
EXPORT_SYMBOL(rdmsr_safe_on_cpu);


Subject: [tip:x86/cleanups] x86/cpuid: Allow cpuid_read() to schedule

Commit-ID: 67bbd7a8d6bcdc44cc27105ae8c374e9176ceaf1
Gitweb: https://git.kernel.org/tip/67bbd7a8d6bcdc44cc27105ae8c374e9176ceaf1
Author: Eric Dumazet <[email protected]>
AuthorDate: Fri, 23 Mar 2018 14:58:18 -0700
Committer: Thomas Gleixner <[email protected]>
CommitDate: Tue, 27 Mar 2018 12:01:48 +0200

x86/cpuid: Allow cpuid_read() to schedule

High latencies can be observed caused by a daemon periodically reading
CPUID on all cpus. On KASAN enabled kernels ~10ms latencies can be
observed. Even without KASAN, sending an IPI to a CPU, which is in a deep
sleep state or in a long hard IRQ disabled section, waiting for the answer
can consume hundreds of microseconds.

cpuid_read() is invoked in preemptible context, so it can be converted to
sleep instead of busy wait.

Switching to smp_call_function_single_async() and a completion allows to
reschedule and reduces CPU usage and latencies.

Signed-off-by: Eric Dumazet <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Acked-by: Ingo Molnar <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Eric Dumazet <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]

---
arch/x86/kernel/cpuid.c | 34 ++++++++++++++++++++++++++--------
1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 0931a105ffe1..1d300f96df4b 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -40,6 +40,7 @@
#include <linux/notifier.h>
#include <linux/uaccess.h>
#include <linux/gfp.h>
+#include <linux/completion.h>

#include <asm/processor.h>
#include <asm/msr.h>
@@ -47,19 +48,27 @@
static struct class *cpuid_class;
static enum cpuhp_state cpuhp_cpuid_state;

+struct cpuid_regs_done {
+ struct cpuid_regs regs;
+ struct completion done;
+};
+
static void cpuid_smp_cpuid(void *cmd_block)
{
- struct cpuid_regs *cmd = (struct cpuid_regs *)cmd_block;
+ struct cpuid_regs_done *cmd = cmd_block;
+
+ cpuid_count(cmd->regs.eax, cmd->regs.ecx,
+ &cmd->regs.eax, &cmd->regs.ebx,
+ &cmd->regs.ecx, &cmd->regs.edx);

- cpuid_count(cmd->eax, cmd->ecx,
- &cmd->eax, &cmd->ebx, &cmd->ecx, &cmd->edx);
+ complete(&cmd->done);
}

static ssize_t cpuid_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
char __user *tmp = buf;
- struct cpuid_regs cmd;
+ struct cpuid_regs_done cmd;
int cpu = iminor(file_inode(file));
u64 pos = *ppos;
ssize_t bytes = 0;
@@ -68,19 +77,28 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
if (count % 16)
return -EINVAL; /* Invalid chunk size */

+ init_completion(&cmd.done);
for (; count; count -= 16) {
- cmd.eax = pos;
- cmd.ecx = pos >> 32;
- err = smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1);
+ call_single_data_t csd = {
+ .func = cpuid_smp_cpuid,
+ .info = &cmd,
+ };
+
+ cmd.regs.eax = pos;
+ cmd.regs.ecx = pos >> 32;
+
+ err = smp_call_function_single_async(cpu, &csd);
if (err)
break;
- if (copy_to_user(tmp, &cmd, 16)) {
+ wait_for_completion(&cmd.done);
+ if (copy_to_user(tmp, &cmd.regs, 16)) {
err = -EFAULT;
break;
}
tmp += 16;
bytes += 16;
*ppos = ++pos;
+ reinit_completion(&cmd.done);
}

return bytes ? bytes : err;