2023-05-23 20:17:28

by Tim Wiederhake

[permalink] [raw]
Subject: [PATCH 1/2] x86/msr: Read MSRs individually

Reading from /dev/cpu/*/msr with a buffer size > 8 reads the data of
the same MSR repeatedly, rather than the data of consecutive MSRs as
one might expect.

Solve this by restricting MSR reads to one per call.

Signed-off-by: Tim Wiederhake <[email protected]>
---
arch/x86/kernel/msr.c | 21 +++++++--------------
1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 7bb17d37db01..058f2b67d0c7 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -58,24 +58,17 @@ static ssize_t msr_read(struct file *file, char __user *buf,
u32 reg = *ppos;
int cpu = iminor(file_inode(file));
int err = 0;
- ssize_t bytes = 0;

- if (count % 8)
+ if (count < 8)
return -EINVAL; /* Invalid chunk size */

- for (; count; count -= 8) {
- err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]);
- if (err)
- break;
- if (copy_to_user(tmp, &data, 8)) {
- err = -EFAULT;
- break;
- }
- tmp += 2;
- bytes += 8;
- }
+ err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]);
+ if (err)
+ return err;
+ if (copy_to_user(tmp, &data, 8))
+ return -EFAULT;

- return bytes ? bytes : err;
+ return 8;
}

static int filter_write(u32 reg)
--
2.39.2



2023-05-23 20:47:07

by Tim Wiederhake

[permalink] [raw]
Subject: [PATCH 2/2] x86/msr: Allow unprivileged read access to some MSRs

Deferring the CAP_SYS_RAWIO check from open() to each
individual access allows unprivileged processes to
read specific MSRs, such as IA32_CORE_CAPABILITIES and
IA32_ARCH_CAPABILITIES. This is helpful for e.g. qemu and
libvirt, which require the raw MSR content to calculate host
CPU capabilities. Other programs might be interested in
IA32_EFER for x86-64-v1 detection.

Signed-off-by: Tim Wiederhake <[email protected]>
---
arch/x86/kernel/msr.c | 38 +++++++++++++++++++++++++++++++++-----
1 file changed, 33 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 058f2b67d0c7..9485aa7f8161 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -50,6 +50,23 @@ enum allow_write_msrs {

static enum allow_write_msrs allow_writes = MSR_WRITES_DEFAULT;

+static int filter_read(struct file *file, u32 reg)
+{
+ if (file->private_data)
+ return 0;
+
+ switch (reg) {
+ case MSR_IA32_CORE_CAPS:
+ case MSR_IA32_ARCH_CAPABILITIES:
+ case MSR_EFER:
+ return 0;
+ default:
+ break;
+ }
+
+ return -EPERM;
+}
+
static ssize_t msr_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
@@ -59,6 +76,10 @@ static ssize_t msr_read(struct file *file, char __user *buf,
int cpu = iminor(file_inode(file));
int err = 0;

+ err = filter_read(file, reg);
+ if (err)
+ return err;
+
if (count < 8)
return -EINVAL; /* Invalid chunk size */

@@ -71,7 +92,7 @@ static ssize_t msr_read(struct file *file, char __user *buf,
return 8;
}

-static int filter_write(u32 reg)
+static int filter_write(struct file *file, u32 reg)
{
/*
* MSRs writes usually happen all at once, and can easily saturate kmsg.
@@ -83,6 +104,9 @@ static int filter_write(u32 reg)
*/
static DEFINE_RATELIMIT_STATE(fw_rs, 30 * HZ, 1);

+ if (!file->private_data)
+ return -EPERM;
+
switch (allow_writes) {
case MSR_WRITES_ON: return 0;
case MSR_WRITES_OFF: return -EPERM;
@@ -113,7 +137,7 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
if (err)
return err;

- err = filter_write(reg);
+ err = filter_write(file, reg);
if (err)
return err;

@@ -156,6 +180,9 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
err = -EFAULT;
break;
}
+ err = filter_read(file, regs[1]);
+ if (err)
+ return err;
err = rdmsr_safe_regs_on_cpu(cpu, regs);
if (err)
break;
@@ -176,7 +203,7 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
if (err)
break;

- err = filter_write(regs[1]);
+ err = filter_write(file, regs[1]);
if (err)
return err;

@@ -202,8 +229,7 @@ static int msr_open(struct inode *inode, struct file *file)
unsigned int cpu = iminor(file_inode(file));
struct cpuinfo_x86 *c;

- if (!capable(CAP_SYS_RAWIO))
- return -EPERM;
+ file->private_data = (void *)(capable(CAP_SYS_RAWIO));

if (cpu >= nr_cpu_ids || !cpu_online(cpu))
return -ENXIO; /* No such CPU */
@@ -245,6 +271,8 @@ static int msr_device_destroy(unsigned int cpu)

static char *msr_devnode(const struct device *dev, umode_t *mode)
{
+ if (mode)
+ *mode = 0644;
return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
}

--
2.39.2


2023-05-30 10:33:54

by Tim Wiederhake

[permalink] [raw]
Subject: [PATCH v2] x86/msr: Allow unprivileged read access to some MSRs

Software such as qemu and libvirt require the raw content of some MSRs
to calculate host CPU capabilities. This is currently done through
/dev/cpu/*/msr which is locked behind both CAP_SYS_RAWIO and file mode
0600, allowing only root to read and write MSRs.

Expose some non-security sensitive MSRs through sysfs to allow access
for unprivileged processes. This also helps other programs that are
interested in IA32_EFER for x86-64-v1 detection.

Signed-off-by: Tim Wiederhake <[email protected]>
---
Changes to v1 (https://lkml.org/lkml/2023/5/23/1230):
* removed patch to limit reads to /dev/cpu/*/msr to 8 bytes per read
* removed CAP_SYS_RAWIO-less access to /dev/cpu/*/msr
* introduced sysfs interface to msrs

With this sysfs-based, unrestricted read access to some select msrs in
place, a later patch could introduce checks for CAP_SYS_RAWIO for every
access to /dev/cpu/*/msr as mentioned in the feedback to v1.
---
arch/x86/kernel/msr.c | 45 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 45 insertions(+)

diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 7bb17d37db01..3c8354f3c2bd 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -50,6 +50,31 @@ enum allow_write_msrs {

static enum allow_write_msrs allow_writes = MSR_WRITES_DEFAULT;

+struct allow_read_msrs {
+ const char *procname;
+ u32 index;
+ u32 value[2];
+};
+
+static struct allow_read_msrs allow_reads[] = {
+ {
+ .procname = "ia32_core_caps",
+ .index = MSR_IA32_CORE_CAPS,
+ },
+ {
+ .procname = "ia32_arch_capabilities",
+ .index = MSR_IA32_ARCH_CAPABILITIES,
+ },
+ {
+ .procname = "efer",
+ .index = MSR_EFER,
+ },
+};
+
+static struct ctl_table msr_files[ARRAY_SIZE(allow_reads) + 1];
+
+static struct ctl_table_header *msr_files_header;
+
static ssize_t msr_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
@@ -258,6 +283,25 @@ static char *msr_devnode(const struct device *dev, umode_t *mode)
static int __init msr_init(void)
{
int err;
+ int i, j;
+
+ for (i = 0, j = 0; i < ARRAY_SIZE(allow_reads); ++i) {
+ err = rdmsr_safe_on_cpu(0, allow_reads[i].index,
+ &allow_reads[i].value[0],
+ &allow_reads[i].value[1]);
+ if (err)
+ continue;
+ msr_files[j].procname = allow_reads[i].procname;
+ msr_files[j].data = &allow_reads[i].value;
+ msr_files[j].maxlen = 2 * sizeof(u32);
+ msr_files[j].mode = 0444;
+ msr_files[j].proc_handler = proc_doulongvec_minmax;
+ ++j;
+ }
+
+ msr_files_header = register_sysctl("vm/msr", msr_files);
+ if (!msr_files_header)
+ return -ENOMEM;

if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) {
pr_err("unable to get major %d for msr\n", MSR_MAJOR);
@@ -287,6 +331,7 @@ module_init(msr_init);

static void __exit msr_exit(void)
{
+ unregister_sysctl_table(msr_files_header);
cpuhp_remove_state(cpuhp_msr_state);
class_destroy(msr_class);
__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
--
2.39.2


2023-05-30 17:10:27

by Jim Mattson

[permalink] [raw]
Subject: Re: [PATCH v2] x86/msr: Allow unprivileged read access to some MSRs

On Tue, May 30, 2023 at 3:28 AM Tim Wiederhake <[email protected]> wrote:
>
> Software such as qemu and libvirt require the raw content of some MSRs

Note that KVM doesn't return the raw value of IA32_ARCH_CAPABILITIES.
First, it filters out unsupported bits, and then it massages the
result a bit. See kvm_get_arch_capabilities(). Isn't this what qemu
actually wants?

2023-05-30 17:29:57

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCH v2] x86/msr: Allow unprivileged read access to some MSRs

On 5/30/23 03:23, Tim Wiederhake wrote:
> Expose some non-security sensitive MSRs through sysfs to allow access
> for unprivileged processes. This also helps other programs that are
> interested in IA32_EFER for x86-64-v1 detection.

Did you mean "sysfs" or "sysctl"?

I'm still on the fence about whether we should do this. This seems
_marginally_ better than the /dev approach.

But whatever we do we need some *VERY* explicit, tight rules about what
can be exposed via this interface in the future. We absolutely can't
have folks adding to this in the future without following those rules.

A lot of this is implicit in the implementation and even the ABI, but
let's say them out loud, please:

* The MSRs must be read-only. If they are read-write, the snapshot
can get out of date. This can be guaranteed by either:
* Never being written at runtime after they are snapshotted, or
preferably:
* Being defined to be read-only (wrmsr just doesn't work)
* The MSRs must have the exact same value on all CPUs (because
there is only one file per MSR)
* The value must be static. Not only read-only from the software
point of view, but the hardware and hypervisor must also promise not
to change it.

The first two seem doable. I'm not sure how we deal with the third,
though, especially in the case of microcode updates or clever hypervisors.