From: Or Idgar <[email protected]>
This patch is a driver which expose the Virtual Machine Generation ID
via sysfs. The ID is a UUID value used to differentiate between virtual
machines.
The VM-Generation ID is a feature defined by Microsoft (paper:
http://go.microsoft.com/fwlink/?LinkId=260709) and supported by multiple
hypervisor vendors.
Signed-off-by: Or Idgar <[email protected]>
---
Changes in v5:
- added to VMGENID module dependency on ACPI module.
Documentation/ABI/testing/sysfs-hypervisor | 13 +++
drivers/misc/Kconfig | 7 ++
drivers/misc/Makefile | 1 +
drivers/misc/vmgenid.c | 142 +++++++++++++++++++++++++++++
4 files changed, 163 insertions(+)
create mode 100644 Documentation/ABI/testing/sysfs-hypervisor
create mode 100644 drivers/misc/vmgenid.c
diff --git a/Documentation/ABI/testing/sysfs-hypervisor b/Documentation/ABI/testing/sysfs-hypervisor
new file mode 100644
index 000000000000..2f9a7b8eab70
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-hypervisor
@@ -0,0 +1,13 @@
+What: /sys/hypervisor/vm_gen_counter
+Date: February 2018
+Contact: Or Idgar <[email protected]>
+ Gal Hammer <[email protected]>
+Description: Expose the virtual machine generation ID. The directory
+ contains two files: "generation_id" and "raw". Both files
+ represent the same information.
+
+ "generation_id" file is a UUID string
+ representation.
+
+ "raw" file is a 128-bit integer
+ representation (binary).
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index 03605f8fc0dc..a39feff6a867 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -500,6 +500,13 @@ config MISC_RTSX
tristate
default MISC_RTSX_PCI || MISC_RTSX_USB
+config VMGENID
+ depends on ACPI
+ tristate "Virtual Machine Generation ID driver"
+ help
+ This is a Virtual Machine Generation ID driver which provides
+ a virtual machine unique identifier.
+
source "drivers/misc/c2port/Kconfig"
source "drivers/misc/eeprom/Kconfig"
source "drivers/misc/cb710/Kconfig"
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index c3c8624f4d95..067aa666bb6a 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -57,6 +57,7 @@ obj-$(CONFIG_ASPEED_LPC_SNOOP) += aspeed-lpc-snoop.o
obj-$(CONFIG_PCI_ENDPOINT_TEST) += pci_endpoint_test.o
obj-$(CONFIG_OCXL) += ocxl/
obj-$(CONFIG_MISC_RTSX) += cardreader/
+obj-$(CONFIG_VMGENID) += vmgenid.o
lkdtm-$(CONFIG_LKDTM) += lkdtm_core.o
lkdtm-$(CONFIG_LKDTM) += lkdtm_bugs.o
diff --git a/drivers/misc/vmgenid.c b/drivers/misc/vmgenid.c
new file mode 100644
index 000000000000..6c8d8fe75335
--- /dev/null
+++ b/drivers/misc/vmgenid.c
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Virtual Machine Generation ID driver
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All rights reserved.
+ * Authors:
+ * Or Idgar <[email protected]>
+ * Gal Hammer <[email protected]>
+ *
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/kobject.h>
+#include <linux/acpi.h>
+#include <linux/uuid.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Or Idgar <[email protected]>");
+MODULE_AUTHOR("Gal Hammer <[email protected]>");
+MODULE_DESCRIPTION("Virtual Machine Generation ID");
+MODULE_VERSION("0.1");
+
+ACPI_MODULE_NAME("vmgenid");
+
+static u64 phys_addr;
+
+static ssize_t generation_id_show(struct device *_d,
+ struct device_attribute *attr, char *buf)
+{
+ void __iomem *uuid_map;
+ uuid_t uuid;
+ ssize_t result;
+
+ uuid_map = acpi_os_map_iomem(phys_addr, sizeof(uuid_t));
+ if (!uuid_map)
+ return -EFAULT;
+
+ memcpy_fromio(&uuid, uuid_map, sizeof(uuid_t));
+ result = sprintf(buf, "%pUl\n", &uuid);
+ acpi_os_unmap_iomem(uuid_map, sizeof(uuid_t));
+ return result;
+}
+static DEVICE_ATTR_RO(generation_id);
+
+static ssize_t raw_show(struct device *_d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ void __iomem *uuid_map;
+
+ uuid_map = acpi_os_map_iomem(phys_addr, sizeof(uuid_t));
+ if (!uuid_map)
+ return -EFAULT;
+ memcpy_fromio(buf, uuid_map, sizeof(uuid_t));
+ acpi_os_unmap_iomem(uuid_map, sizeof(uuid_t));
+ return sizeof(uuid_t);
+}
+static DEVICE_ATTR_RO(raw);
+
+static struct attribute *vmgenid_attrs[] = {
+ &dev_attr_generation_id.attr,
+ &dev_attr_raw.attr,
+ NULL,
+};
+static const struct attribute_group vmgenid_group = {
+ .name = "vm_gen_counter",
+ .attrs = vmgenid_attrs,
+};
+
+static int get_vmgenid(acpi_handle handle)
+{
+ int i;
+ struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+ acpi_status status;
+ union acpi_object *pss;
+ union acpi_object *element;
+
+ status = acpi_evaluate_object(handle, "ADDR", NULL, &buffer);
+ if (ACPI_FAILURE(status)) {
+ ACPI_EXCEPTION((AE_INFO, status, "Evaluating _ADDR"));
+ return -ENODEV;
+ }
+ pss = buffer.pointer;
+ if (!pss || pss->type != ACPI_TYPE_PACKAGE || pss->package.count != 2)
+ return -EFAULT;
+
+ phys_addr = 0;
+ for (i = 0; i < pss->package.count; i++) {
+ element = &(pss->package.elements[i]);
+ if (element->type != ACPI_TYPE_INTEGER)
+ return -EFAULT;
+ phys_addr |= element->integer.value << i*32;
+ }
+ return 0;
+}
+
+static int acpi_vmgenid_add(struct acpi_device *device)
+{
+ int retval;
+
+ if (!device)
+ return -EINVAL;
+ retval = get_vmgenid(device->handle);
+ if (retval < 0)
+ return retval;
+ return sysfs_create_group(hypervisor_kobj, &vmgenid_group);
+}
+
+static int acpi_vmgenid_remove(struct acpi_device *device)
+{
+ sysfs_remove_group(hypervisor_kobj, &vmgenid_group);
+ return 0;
+}
+
+static const struct acpi_device_id vmgenid_ids[] = {
+ {"QEMUVGID", 0},
+ {"", 0},
+};
+
+static struct acpi_driver acpi_vmgenid_driver = {
+ .name = "vm_gen_counter",
+ .ids = vmgenid_ids,
+ .owner = THIS_MODULE,
+ .ops = {
+ .add = acpi_vmgenid_add,
+ .remove = acpi_vmgenid_remove,
+ }
+};
+
+static int __init vmgenid_init(void)
+{
+ return acpi_bus_register_driver(&acpi_vmgenid_driver);
+}
+
+static void __exit vmgenid_exit(void)
+{
+ acpi_bus_unregister_driver(&acpi_vmgenid_driver);
+}
+
+module_init(vmgenid_init);
+module_exit(vmgenid_exit);
--
2.14.3
Thanks for the patch! Yet something to improve (see below):
On Thu, Mar 01, 2018 at 04:22:15PM +0200, Or Idgar wrote:
> From: Or Idgar <[email protected]>
I see addresses at gmail, virtualoco and redhat.com At this point I
don't really know which address to use to contact you :) All of them?
Also, I think it's a good idea to CC this more widely. Consider CC-ing
qemu contributors to the vm gen id (use git log to get the list), Ben
Warren who wrote a prototype driver a while ago, qemu mailing list,
maybe more.
>
> This patch is a driver which expose the Virtual Machine Generation ID
> via sysfs.
>
> The ID is a UUID value used to differentiate between virtual
> machines.
>
> The VM-Generation ID is a feature defined by Microsoft (paper:
> http://go.microsoft.com/fwlink/?LinkId=260709) and supported by multiple
> hypervisor vendors.
>
> Signed-off-by: Or Idgar <[email protected]>
I think it's a good idea to use sysfs for this. However,
there are a couple of missing interfaces here:
1. Userspace needs a way to know when this value changes.
I see no change notifications here and that does not seem right.
2. Userspace needs to be able to read these without
system calls. Pls add mmap support to the raw format.
(Phys address is not guaranteed to be page-aligned so you will
probably want an offset attribute for that as well).
While it's possible to add these later in theory, it's
easier if userspace can rely on all interfaces being
in place just by detecting the directory presence.
> ---
>
> Changes in v5:
> - added to VMGENID module dependency on ACPI module.
>
> Documentation/ABI/testing/sysfs-hypervisor | 13 +++
> drivers/misc/Kconfig | 7 ++
> drivers/misc/Makefile | 1 +
> drivers/misc/vmgenid.c | 142 +++++++++++++++++++++++++++++
Do you want to add this under QEMU MACHINE EMULATOR in MAINTAINERS?
This way e.g. qemu-devel will be copied.
> 4 files changed, 163 insertions(+)
> create mode 100644 Documentation/ABI/testing/sysfs-hypervisor
> create mode 100644 drivers/misc/vmgenid.c
>
> diff --git a/Documentation/ABI/testing/sysfs-hypervisor b/Documentation/ABI/testing/sysfs-hypervisor
> new file mode 100644
> index 000000000000..2f9a7b8eab70
> --- /dev/null
> +++ b/Documentation/ABI/testing/sysfs-hypervisor
> @@ -0,0 +1,13 @@
> +What: /sys/hypervisor/vm_gen_counter
It's not a counter, is it?
I'd go with "vm_generation_id" here. Eschew abbreviation.
> +Date: February 2018
> +Contact: Or Idgar <[email protected]>
> + Gal Hammer <[email protected]>
> +Description: Expose the virtual machine generation ID. The directory
> + contains two files: "generation_id" and "raw". Both files
> + represent the same information.
> +
> + "generation_id" file is a UUID string
And this I'd call "uuid" then.
> + representation.
> +
> + "raw" file is a 128-bit integer
> + representation (binary).
> diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
> index 03605f8fc0dc..a39feff6a867 100644
> --- a/drivers/misc/Kconfig
> +++ b/drivers/misc/Kconfig
> @@ -500,6 +500,13 @@ config MISC_RTSX
> tristate
> default MISC_RTSX_PCI || MISC_RTSX_USB
>
> +config VMGENID
> + depends on ACPI
> + tristate "Virtual Machine Generation ID driver"
> + help
> + This is a Virtual Machine Generation ID driver which provides
> + a virtual machine unique identifier.
> +
> source "drivers/misc/c2port/Kconfig"
> source "drivers/misc/eeprom/Kconfig"
> source "drivers/misc/cb710/Kconfig"
> diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
> index c3c8624f4d95..067aa666bb6a 100644
> --- a/drivers/misc/Makefile
> +++ b/drivers/misc/Makefile
> @@ -57,6 +57,7 @@ obj-$(CONFIG_ASPEED_LPC_SNOOP) += aspeed-lpc-snoop.o
> obj-$(CONFIG_PCI_ENDPOINT_TEST) += pci_endpoint_test.o
> obj-$(CONFIG_OCXL) += ocxl/
> obj-$(CONFIG_MISC_RTSX) += cardreader/
> +obj-$(CONFIG_VMGENID) += vmgenid.o
>
> lkdtm-$(CONFIG_LKDTM) += lkdtm_core.o
> lkdtm-$(CONFIG_LKDTM) += lkdtm_bugs.o
> diff --git a/drivers/misc/vmgenid.c b/drivers/misc/vmgenid.c
> new file mode 100644
> index 000000000000..6c8d8fe75335
> --- /dev/null
> +++ b/drivers/misc/vmgenid.c
> @@ -0,0 +1,142 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Virtual Machine Generation ID driver
> + *
> + * Copyright (C) 2018 Red Hat, Inc. All rights reserved.
> + * Authors:
> + * Or Idgar <[email protected]>
> + * Gal Hammer <[email protected]>
> + *
> + */
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/kernel.h>
> +#include <linux/kobject.h>
> +#include <linux/acpi.h>
> +#include <linux/uuid.h>
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Or Idgar <[email protected]>");
> +MODULE_AUTHOR("Gal Hammer <[email protected]>");
> +MODULE_DESCRIPTION("Virtual Machine Generation ID");
> +MODULE_VERSION("0.1");
> +
> +ACPI_MODULE_NAME("vmgenid");
> +
> +static u64 phys_addr;
> +
> +static ssize_t generation_id_show(struct device *_d,
> + struct device_attribute *attr, char *buf)
> +{
> + void __iomem *uuid_map;
> + uuid_t uuid;
> + ssize_t result;
> +
> + uuid_map = acpi_os_map_iomem(phys_addr, sizeof(uuid_t));
> + if (!uuid_map)
> + return -EFAULT;
Shouldn't this be acpi_os_map_memory? Spec says it's never an IO address.
> +
> + memcpy_fromio(&uuid, uuid_map, sizeof(uuid_t));
> + result = sprintf(buf, "%pUl\n", &uuid);
> + acpi_os_unmap_iomem(uuid_map, sizeof(uuid_t));
> + return result;
> +}
> +static DEVICE_ATTR_RO(generation_id);
> +
> +static ssize_t raw_show(struct device *_d,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + void __iomem *uuid_map;
> +
> + uuid_map = acpi_os_map_iomem(phys_addr, sizeof(uuid_t));
> + if (!uuid_map)
> + return -EFAULT;
> + memcpy_fromio(buf, uuid_map, sizeof(uuid_t));
> + acpi_os_unmap_iomem(uuid_map, sizeof(uuid_t));
> + return sizeof(uuid_t);
> +}
> +static DEVICE_ATTR_RO(raw);
I think you want BIN_ATTR_RO.
> +
> +static struct attribute *vmgenid_attrs[] = {
> + &dev_attr_generation_id.attr,
> + &dev_attr_raw.attr,
> + NULL,
> +};
> +static const struct attribute_group vmgenid_group = {
> + .name = "vm_gen_counter",
> + .attrs = vmgenid_attrs,
> +};
> +
> +static int get_vmgenid(acpi_handle handle)
> +{
> + int i;
> + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
> + acpi_status status;
> + union acpi_object *pss;
> + union acpi_object *element;
> +
> + status = acpi_evaluate_object(handle, "ADDR", NULL, &buffer);
> + if (ACPI_FAILURE(status)) {
> + ACPI_EXCEPTION((AE_INFO, status, "Evaluating _ADDR"));
It's ADDR - not _ADDR, right?
> + return -ENODEV;
> + }
> + pss = buffer.pointer;
> + if (!pss || pss->type != ACPI_TYPE_PACKAGE || pss->package.count != 2)
> + return -EFAULT;
> +
> + phys_addr = 0;
> + for (i = 0; i < pss->package.count; i++) {
> + element = &(pss->package.elements[i]);
> + if (element->type != ACPI_TYPE_INTEGER)
> + return -EFAULT;
EINVAL here and elsewhere.
> + phys_addr |= element->integer.value << i*32;
i * 32
> + }
> + return 0;
> +}
> +
> +static int acpi_vmgenid_add(struct acpi_device *device)
> +{
> + int retval;
> +
> + if (!device)
> + return -EINVAL;
> + retval = get_vmgenid(device->handle);
> + if (retval < 0)
> + return retval;
> + return sysfs_create_group(hypervisor_kobj, &vmgenid_group);
> +}
> +
> +static int acpi_vmgenid_remove(struct acpi_device *device)
> +{
> + sysfs_remove_group(hypervisor_kobj, &vmgenid_group);
> + return 0;
> +}
> +
> +static const struct acpi_device_id vmgenid_ids[] = {
> + {"QEMUVGID", 0},
> + {"", 0},
> +};
That's not right I think. You should go by _CID and maybe
_DDN.
>
> +
> +static struct acpi_driver acpi_vmgenid_driver = {
> + .name = "vm_gen_counter",
> + .ids = vmgenid_ids,
> + .owner = THIS_MODULE,
> + .ops = {
> + .add = acpi_vmgenid_add,
> + .remove = acpi_vmgenid_remove,
> + }
> +};
> +
> +static int __init vmgenid_init(void)
> +{
> + return acpi_bus_register_driver(&acpi_vmgenid_driver);
> +}
> +
> +static void __exit vmgenid_exit(void)
> +{
> + acpi_bus_unregister_driver(&acpi_vmgenid_driver);
> +}
> +
> +module_init(vmgenid_init);
> +module_exit(vmgenid_exit);
Thanks for your work, and I hope this helps!
--
MST
On Tue, Mar 13, 2018 at 07:40:51PM +0200, Michael S. Tsirkin wrote:
> I think it's a good idea to use sysfs for this. However,
> there are a couple of missing interfaces here:
>
> 1. Userspace needs a way to know when this value changes.
> I see no change notifications here and that does not seem right.
How can these change?
> 2. Userspace needs to be able to read these without
> system calls.
Ick, what? Why not?
> Pls add mmap support to the raw format.
For a single integer? Why do you need mmap for this? What is so
"performant" that needs to touch a sysfs file?
> (Phys address is not guaranteed to be page-aligned so you will
> probably want an offset attribute for that as well).
Ick ick ick, that's why it's good to just stick with a sysfs file.
Have you tested just how long this takes to see if the open/read/close
is really the bottleneck, or if the io on reading the value is the
bottleneck?
thanks,
greg k-h
On Wed, Mar 14, 2018 at 07:25:36PM +0100, Greg KH wrote:
> On Tue, Mar 13, 2018 at 07:40:51PM +0200, Michael S. Tsirkin wrote:
> > I think it's a good idea to use sysfs for this. However,
> > there are a couple of missing interfaces here:
> >
> > 1. Userspace needs a way to know when this value changes.
> > I see no change notifications here and that does not seem right.
>
> How can these change?
It's a hardware register. It changes when hardware feels like it :)
In particular, it changes whenever VM is migrated or snapshotted.
> > 2. Userspace needs to be able to read these without
> > system calls.
>
> Ick, what? Why not?
>
> > Pls add mmap support to the raw format.
>
> For a single integer? Why do you need mmap for this? What is so
> "performant" that needs to touch a sysfs file?
> > (Phys address is not guaranteed to be page-aligned so you will
> > probably want an offset attribute for that as well).
>
> Ick ick ick, that's why it's good to just stick with a sysfs file.
>
> Have you tested just how long this takes to see if the open/read/close
> is really the bottleneck, or if the io on reading the value is the
> bottleneck?
>
> thanks,
>
> greg k-h
Well an application needs to check this value basically after
every database transaction. So I'm pretty sure it's a performance
sensitive path. But yes, I didn't profile any apps since they
are yet to be written to use this interface.
I'm fine deferring point 2 for now.
--
MST
On Wed, Mar 14, 2018 at 9:25 PM, Michael S. Tsirkin <[email protected]> wrote:
> On Wed, Mar 14, 2018 at 07:25:36PM +0100, Greg KH wrote:
>> On Tue, Mar 13, 2018 at 07:40:51PM +0200, Michael S. Tsirkin wrote:
>> > I think it's a good idea to use sysfs for this. However,
>> > there are a couple of missing interfaces here:
>> >
>> > 1. Userspace needs a way to know when this value changes.
>> > I see no change notifications here and that does not seem right.
>>
>> How can these change?
>
> It's a hardware register. It changes when hardware feels like it :)
> In particular, it changes whenever VM is migrated or snapshotted.
This value doesn't change when a VM is migrated. It is unlikely that
this value will be changed so frequently that a direct access to the
memory is required. Even in QEMU, the current implementation was
merged without an option to change the generation id on-the-fly. One
must run a new instance in order to set a new value, which means that
no application will be running during that time.
>> > 2. Userspace needs to be able to read these without
>> > system calls.
>>
>> Ick, what? Why not?
>>
>> > Pls add mmap support to the raw format.
>>
>> For a single integer? Why do you need mmap for this? What is so
>> "performant" that needs to touch a sysfs file?
>> > (Phys address is not guaranteed to be page-aligned so you will
>> > probably want an offset attribute for that as well).
>>
>> Ick ick ick, that's why it's good to just stick with a sysfs file.
I agree with Greg here. The user is able to read the value, and then
wait for a notification if she cares about changes.
>> Have you tested just how long this takes to see if the open/read/close
>> is really the bottleneck, or if the io on reading the value is the
>> bottleneck?
>>
>> thanks,
>>
>> greg k-h
>
> Well an application needs to check this value basically after
> every database transaction. So I'm pretty sure it's a performance
> sensitive path. But yes, I didn't profile any apps since they
> are yet to be written to use this interface.
> I'm fine deferring point 2 for now.
>
> --
> MST
Thanks,
Gal.
On Tue, Mar 13, 2018 at 7:40 PM, Michael S. Tsirkin <[email protected]> wrote:
> Thanks for the patch! Yet something to improve (see below):
Thanks for the review.
> On Thu, Mar 01, 2018 at 04:22:15PM +0200, Or Idgar wrote:
>> From: Or Idgar <[email protected]>
>
> I see addresses at gmail, virtualoco and redhat.com At this point I
> don't really know which address to use to contact you :) All of them?
Use his gmail or virtualoco one.
> Also, I think it's a good idea to CC this more widely. Consider CC-ing
> qemu contributors to the vm gen id (use git log to get the list), Ben
> Warren who wrote a prototype driver a while ago, qemu mailing list,
> maybe more.
>
>>
>> This patch is a driver which expose the Virtual Machine Generation ID
>> via sysfs.
>>
>> The ID is a UUID value used to differentiate between virtual
>> machines.
>>
>> The VM-Generation ID is a feature defined by Microsoft (paper:
>> http://go.microsoft.com/fwlink/?LinkId=260709) and supported by multiple
>> hypervisor vendors.
>>
>> Signed-off-by: Or Idgar <[email protected]>
>
> I think it's a good idea to use sysfs for this. However,
> there are a couple of missing interfaces here:
>
> 1. Userspace needs a way to know when this value changes.
> I see no change notifications here and that does not seem right.
We have the next version which includes notification. The problem is
that right now QEMU doesn't include a mean to change the generation id
on-the-fly, so it was not published it.
> 2. Userspace needs to be able to read these without
> system calls. Pls add mmap support to the raw format.
> (Phys address is not guaranteed to be page-aligned so you will
> probably want an offset attribute for that as well).
I don't agree that this should be a requirement. According to the
specs, the value doesn't change frequently. A notification about a
change the re-reading the value should be enough for now.
> While it's possible to add these later in theory, it's
> easier if userspace can rely on all interfaces being
> in place just by detecting the directory presence.
>
>> ---
>>
>> Changes in v5:
>> - added to VMGENID module dependency on ACPI module.
>>
>> Documentation/ABI/testing/sysfs-hypervisor | 13 +++
>> drivers/misc/Kconfig | 7 ++
>> drivers/misc/Makefile | 1 +
>> drivers/misc/vmgenid.c | 142 +++++++++++++++++++++++++++++
>
> Do you want to add this under QEMU MACHINE EMULATOR in MAINTAINERS?
> This way e.g. qemu-devel will be copied.
This feature is supported by other hypervisors and this driver should
be able to support them in the next versions. I don't see a reason to
bound it to QEMU.
>> 4 files changed, 163 insertions(+)
>> create mode 100644 Documentation/ABI/testing/sysfs-hypervisor
>> create mode 100644 drivers/misc/vmgenid.c
>>
>> diff --git a/Documentation/ABI/testing/sysfs-hypervisor b/Documentation/ABI/testing/sysfs-hypervisor
>> new file mode 100644
>> index 000000000000..2f9a7b8eab70
>> --- /dev/null
>> +++ b/Documentation/ABI/testing/sysfs-hypervisor
>> @@ -0,0 +1,13 @@
>> +What: /sys/hypervisor/vm_gen_counter
>
> It's not a counter, is it?
> I'd go with "vm_generation_id" here. Eschew abbreviation.
The names were chosen so they'll match Microsoft's specification and
code examples.
>> +Date: February 2018
>> +Contact: Or Idgar <[email protected]>
>> + Gal Hammer <[email protected]>
>> +Description: Expose the virtual machine generation ID. The directory
>> + contains two files: "generation_id" and "raw". Both files
>> + represent the same information.
>> +
>> + "generation_id" file is a UUID string
>
> And this I'd call "uuid" then.
Why? The name says what the value is, not its type. This is not common
to see values' types in the sysfs directory.
>> + representation.
>> +
>> + "raw" file is a 128-bit integer
>> + representation (binary).
>> diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
>> index 03605f8fc0dc..a39feff6a867 100644
>> --- a/drivers/misc/Kconfig
>> +++ b/drivers/misc/Kconfig
>> @@ -500,6 +500,13 @@ config MISC_RTSX
>> tristate
>> default MISC_RTSX_PCI || MISC_RTSX_USB
>>
>> +config VMGENID
>> + depends on ACPI
>> + tristate "Virtual Machine Generation ID driver"
>> + help
>> + This is a Virtual Machine Generation ID driver which provides
>> + a virtual machine unique identifier.
>> +
>> source "drivers/misc/c2port/Kconfig"
>> source "drivers/misc/eeprom/Kconfig"
>> source "drivers/misc/cb710/Kconfig"
>> diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
>> index c3c8624f4d95..067aa666bb6a 100644
>> --- a/drivers/misc/Makefile
>> +++ b/drivers/misc/Makefile
>> @@ -57,6 +57,7 @@ obj-$(CONFIG_ASPEED_LPC_SNOOP) += aspeed-lpc-snoop.o
>> obj-$(CONFIG_PCI_ENDPOINT_TEST) += pci_endpoint_test.o
>> obj-$(CONFIG_OCXL) += ocxl/
>> obj-$(CONFIG_MISC_RTSX) += cardreader/
>> +obj-$(CONFIG_VMGENID) += vmgenid.o
>>
>> lkdtm-$(CONFIG_LKDTM) += lkdtm_core.o
>> lkdtm-$(CONFIG_LKDTM) += lkdtm_bugs.o
>> diff --git a/drivers/misc/vmgenid.c b/drivers/misc/vmgenid.c
>> new file mode 100644
>> index 000000000000..6c8d8fe75335
>> --- /dev/null
>> +++ b/drivers/misc/vmgenid.c
>> @@ -0,0 +1,142 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Virtual Machine Generation ID driver
>> + *
>> + * Copyright (C) 2018 Red Hat, Inc. All rights reserved.
>> + * Authors:
>> + * Or Idgar <[email protected]>
>> + * Gal Hammer <[email protected]>
>> + *
>> + */
>> +#include <linux/init.h>
>> +#include <linux/module.h>
>> +#include <linux/kernel.h>
>> +#include <linux/kobject.h>
>> +#include <linux/acpi.h>
>> +#include <linux/uuid.h>
>> +
>> +MODULE_LICENSE("GPL");
>> +MODULE_AUTHOR("Or Idgar <[email protected]>");
>> +MODULE_AUTHOR("Gal Hammer <[email protected]>");
>> +MODULE_DESCRIPTION("Virtual Machine Generation ID");
>> +MODULE_VERSION("0.1");
>> +
>> +ACPI_MODULE_NAME("vmgenid");
>> +
>> +static u64 phys_addr;
>> +
>> +static ssize_t generation_id_show(struct device *_d,
>> + struct device_attribute *attr, char *buf)
>> +{
>> + void __iomem *uuid_map;
>> + uuid_t uuid;
>> + ssize_t result;
>> +
>> + uuid_map = acpi_os_map_iomem(phys_addr, sizeof(uuid_t));
>> + if (!uuid_map)
>> + return -EFAULT;
>
> Shouldn't this be acpi_os_map_memory? Spec says it's never an IO address.
>
>> +
>> + memcpy_fromio(&uuid, uuid_map, sizeof(uuid_t));
>> + result = sprintf(buf, "%pUl\n", &uuid);
>> + acpi_os_unmap_iomem(uuid_map, sizeof(uuid_t));
>> + return result;
>> +}
>> +static DEVICE_ATTR_RO(generation_id);
>> +
>> +static ssize_t raw_show(struct device *_d,
>> + struct device_attribute *attr,
>> + char *buf)
>> +{
>> + void __iomem *uuid_map;
>> +
>> + uuid_map = acpi_os_map_iomem(phys_addr, sizeof(uuid_t));
>> + if (!uuid_map)
>> + return -EFAULT;
>> + memcpy_fromio(buf, uuid_map, sizeof(uuid_t));
>> + acpi_os_unmap_iomem(uuid_map, sizeof(uuid_t));
>> + return sizeof(uuid_t);
>> +}
>> +static DEVICE_ATTR_RO(raw);
>
> I think you want BIN_ATTR_RO.
>
>
>> +
>> +static struct attribute *vmgenid_attrs[] = {
>> + &dev_attr_generation_id.attr,
>> + &dev_attr_raw.attr,
>> + NULL,
>> +};
>> +static const struct attribute_group vmgenid_group = {
>> + .name = "vm_gen_counter",
>> + .attrs = vmgenid_attrs,
>> +};
>> +
>> +static int get_vmgenid(acpi_handle handle)
>> +{
>> + int i;
>> + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
>> + acpi_status status;
>> + union acpi_object *pss;
>> + union acpi_object *element;
>> +
>> + status = acpi_evaluate_object(handle, "ADDR", NULL, &buffer);
>> + if (ACPI_FAILURE(status)) {
>> + ACPI_EXCEPTION((AE_INFO, status, "Evaluating _ADDR"));
>
> It's ADDR - not _ADDR, right?
>
>> + return -ENODEV;
>> + }
>> + pss = buffer.pointer;
>> + if (!pss || pss->type != ACPI_TYPE_PACKAGE || pss->package.count != 2)
>> + return -EFAULT;
>> +
>> + phys_addr = 0;
>> + for (i = 0; i < pss->package.count; i++) {
>> + element = &(pss->package.elements[i]);
>> + if (element->type != ACPI_TYPE_INTEGER)
>> + return -EFAULT;
>
> EINVAL here and elsewhere.
>
>> + phys_addr |= element->integer.value << i*32;
>
> i * 32
>
>> + }
>> + return 0;
>> +}
>> +
>> +static int acpi_vmgenid_add(struct acpi_device *device)
>> +{
>> + int retval;
>> +
>> + if (!device)
>> + return -EINVAL;
>> + retval = get_vmgenid(device->handle);
>> + if (retval < 0)
>> + return retval;
>> + return sysfs_create_group(hypervisor_kobj, &vmgenid_group);
>> +}
>> +
>> +static int acpi_vmgenid_remove(struct acpi_device *device)
>> +{
>> + sysfs_remove_group(hypervisor_kobj, &vmgenid_group);
>> + return 0;
>> +}
>> +
>> +static const struct acpi_device_id vmgenid_ids[] = {
>> + {"QEMUVGID", 0},
>> + {"", 0},
>> +};
>
> That's not right I think. You should go by _CID and maybe
> _DDN.
You are probably right on this. However, we didn't see that Linux
supports loading ACPI modules other than using the _HID value.
>>
>> +
>> +static struct acpi_driver acpi_vmgenid_driver = {
>> + .name = "vm_gen_counter",
>> + .ids = vmgenid_ids,
>> + .owner = THIS_MODULE,
>> + .ops = {
>> + .add = acpi_vmgenid_add,
>> + .remove = acpi_vmgenid_remove,
>> + }
>> +};
>> +
>> +static int __init vmgenid_init(void)
>> +{
>> + return acpi_bus_register_driver(&acpi_vmgenid_driver);
>> +}
>> +
>> +static void __exit vmgenid_exit(void)
>> +{
>> + acpi_bus_unregister_driver(&acpi_vmgenid_driver);
>> +}
>> +
>> +module_init(vmgenid_init);
>> +module_exit(vmgenid_exit);
>
> Thanks for your work, and I hope this helps!
>
> --
> MST
Thanks,
Gal.
On Thu, Mar 15, 2018 at 08:57:05AM +0200, Gal Hammer wrote:
> On Wed, Mar 14, 2018 at 9:25 PM, Michael S. Tsirkin <[email protected]> wrote:
> > On Wed, Mar 14, 2018 at 07:25:36PM +0100, Greg KH wrote:
> >> On Tue, Mar 13, 2018 at 07:40:51PM +0200, Michael S. Tsirkin wrote:
> >> > I think it's a good idea to use sysfs for this. However,
> >> > there are a couple of missing interfaces here:
> >> >
> >> > 1. Userspace needs a way to know when this value changes.
> >> > I see no change notifications here and that does not seem right.
> >>
> >> How can these change?
> >
> > It's a hardware register. It changes when hardware feels like it :)
> > In particular, it changes whenever VM is migrated or snapshotted.
>
> This value doesn't change when a VM is migrated. It is unlikely that
> this value will be changed so frequently that a direct access to the
> memory is required. Even in QEMU, the current implementation was
> merged without an option to change the generation id on-the-fly. One
> must run a new instance in order to set a new value, which means that
> no application will be running during that time.
The point is still that it changes without an application
or the kernel doing anything.
> >> > 2. Userspace needs to be able to read these without
> >> > system calls.
> >>
> >> Ick, what? Why not?
> >>
> >> > Pls add mmap support to the raw format.
> >>
> >> For a single integer? Why do you need mmap for this? What is so
> >> "performant" that needs to touch a sysfs file?
> >> > (Phys address is not guaranteed to be page-aligned so you will
> >> > probably want an offset attribute for that as well).
> >>
> >> Ick ick ick, that's why it's good to just stick with a sysfs file.
>
> I agree with Greg here. The user is able to read the value, and then
> wait for a notification if she cares about changes.
OK. Patch has to implement notifications for this to work though.
That's missing.
> >> Have you tested just how long this takes to see if the open/read/close
> >> is really the bottleneck, or if the io on reading the value is the
> >> bottleneck?
> >>
> >> thanks,
> >>
> >> greg k-h
> >
> > Well an application needs to check this value basically after
> > every database transaction. So I'm pretty sure it's a performance
> > sensitive path. But yes, I didn't profile any apps since they
> > are yet to be written to use this interface.
> > I'm fine deferring point 2 for now.
> >
> > --
> > MST
>
> Thanks,
>
> Gal.
On Wed, Mar 14, 2018 at 09:25:17PM +0200, Michael S. Tsirkin wrote:
> On Wed, Mar 14, 2018 at 07:25:36PM +0100, Greg KH wrote:
> > On Tue, Mar 13, 2018 at 07:40:51PM +0200, Michael S. Tsirkin wrote:
> > > I think it's a good idea to use sysfs for this. However,
> > > there are a couple of missing interfaces here:
> > >
> > > 1. Userspace needs a way to know when this value changes.
> > > I see no change notifications here and that does not seem right.
> >
> > How can these change?
>
> It's a hardware register. It changes when hardware feels like it :)
Does the hardware notify the kernel that it changes? Or does the kernel
have to "poll" for it?
> In particular, it changes whenever VM is migrated or snapshotted.
So very rarely. And userspace always knows about those events already,
right?
> > > 2. Userspace needs to be able to read these without
> > > system calls.
> >
> > Ick, what? Why not?
> >
> > > Pls add mmap support to the raw format.
> >
> > For a single integer? Why do you need mmap for this? What is so
> > "performant" that needs to touch a sysfs file?
> > > (Phys address is not guaranteed to be page-aligned so you will
> > > probably want an offset attribute for that as well).
> >
> > Ick ick ick, that's why it's good to just stick with a sysfs file.
> >
> > Have you tested just how long this takes to see if the open/read/close
> > is really the bottleneck, or if the io on reading the value is the
> > bottleneck?
> >
> > thanks,
> >
> > greg k-h
>
> Well an application needs to check this value basically after
> every database transaction.
"every"? That's horrid, why would you write a database that has to do
an ACPI i/o call for every transaction? That's a sure way to write a
very slow database :(
> So I'm pretty sure it's a performance sensitive path.
Given that this api is not present today, why is this even needed? Who
wants/needs it so badly that it has to be tuned in ways like this?
If it is _really_ performant critical, just make it a new syscall :)
> But yes, I
> didn't profile any apps since they
> are yet to be written to use this interface.
Then what database are you talking about? What apps need/want this
thing?
thanks,
greg k-h
On Thu, Mar 15, 2018 at 10:00:30AM +0200, Gal Hammer wrote:
> On Tue, Mar 13, 2018 at 7:40 PM, Michael S. Tsirkin <[email protected]> wrote:
> > Thanks for the patch! Yet something to improve (see below):
>
> Thanks for the review.
>
> > On Thu, Mar 01, 2018 at 04:22:15PM +0200, Or Idgar wrote:
> >> From: Or Idgar <[email protected]>
> >
> > I see addresses at gmail, virtualoco and redhat.com At this point I
> > don't really know which address to use to contact you :) All of them?
>
> Use his gmail or virtualoco one.
>
> > Also, I think it's a good idea to CC this more widely. Consider CC-ing
> > qemu contributors to the vm gen id (use git log to get the list), Ben
> > Warren who wrote a prototype driver a while ago, qemu mailing list,
> > maybe more.
> >
> >>
> >> This patch is a driver which expose the Virtual Machine Generation ID
> >> via sysfs.
> >>
> >> The ID is a UUID value used to differentiate between virtual
> >> machines.
> >>
> >> The VM-Generation ID is a feature defined by Microsoft (paper:
> >> http://go.microsoft.com/fwlink/?LinkId=260709) and supported by multiple
> >> hypervisor vendors.
> >>
> >> Signed-off-by: Or Idgar <[email protected]>
> >
> > I think it's a good idea to use sysfs for this. However,
> > there are a couple of missing interfaces here:
> >
> > 1. Userspace needs a way to know when this value changes.
> > I see no change notifications here and that does not seem right.
>
> We have the next version which includes notification. The problem is
> that right now QEMU doesn't include a mean to change the generation id
> on-the-fly, so it was not published it.
It seems to send this notification on each migration or
resume from snapshot.
> > system calls. Pls add mmap support to the raw format.
> > (Phys address is not guaranteed to be page-aligned so you will
> > probably want an offset attribute for that as well).
>
> I don't agree that this should be a requirement. According to the
> specs, the value doesn't change frequently.
What matters is whether it's read frequently, not whether it
changes frequently.
Still, I agree we can defer that
for now until apps actually start using the new interface.
> A notification about a
> change the re-reading the value should be enough for now.
Notifications are asynchronous so I am not sure it's robust to rely on
them. So I suspect we'll need it down the road but I agree to defer that
for now until apps actually start using the new interface.
> > While it's possible to add these later in theory, it's
> > easier if userspace can rely on all interfaces being
> > in place just by detecting the directory presence.
> >
> >> ---
> >>
> >> Changes in v5:
> >> - added to VMGENID module dependency on ACPI module.
> >>
> >> Documentation/ABI/testing/sysfs-hypervisor | 13 +++
> >> drivers/misc/Kconfig | 7 ++
> >> drivers/misc/Makefile | 1 +
> >> drivers/misc/vmgenid.c | 142 +++++++++++++++++++++++++++++
> >
> > Do you want to add this under QEMU MACHINE EMULATOR in MAINTAINERS?
> > This way e.g. qemu-devel will be copied.
>
> This feature is supported by other hypervisors and this driver should
> be able to support them in the next versions. I don't see a reason to
> bound it to QEMU.
Go ahead and add the microsoft list too, whatever it is.
Point is you want people familiar with the spec to
look at patches.
> >> 4 files changed, 163 insertions(+)
> >> create mode 100644 Documentation/ABI/testing/sysfs-hypervisor
> >> create mode 100644 drivers/misc/vmgenid.c
> >>
> >> diff --git a/Documentation/ABI/testing/sysfs-hypervisor b/Documentation/ABI/testing/sysfs-hypervisor
> >> new file mode 100644
> >> index 000000000000..2f9a7b8eab70
> >> --- /dev/null
> >> +++ b/Documentation/ABI/testing/sysfs-hypervisor
> >> @@ -0,0 +1,13 @@
> >> +What: /sys/hypervisor/vm_gen_counter
> >
> > It's not a counter, is it?
> > I'd go with "vm_generation_id" here. Eschew abbreviation.
>
> The names were chosen so they'll match Microsoft's specification and
> code examples.
I don't think this makes sense. We are not going to use '\' in file
names to match Microsoft code examples, either.
Maybe they say counter in some docs because someone somewhere uses this
as a counter. Or maybe some architect wrote the code examples before the
name was changed from gen counter to generation id and did not bother
changing the code examples. Who knows - there is still no point in
building an API around that.
In particular QEMU certainly does not pass a counter in that field,
and spec does not say it should.
> >> +Date: February 2018
> >> +Contact: Or Idgar <[email protected]>
> >> + Gal Hammer <[email protected]>
> >> +Description: Expose the virtual machine generation ID. The directory
> >> + contains two files: "generation_id" and "raw". Both files
> >> + represent the same information.
> >> +
> >> + "generation_id" file is a UUID string
> >
> > And this I'd call "uuid" then.
>
> Why? The name says what the value is, not its type. This is not common
> to see values' types in the sysfs directory.
Look at the hierarchy:
/sys/hypervisor/vm_gen_counter/generation_id
/sys/hypervisor/vm_gen_counter/raw
Naming it vm_gen_counter/generation_id makes it seem like
there is a gen counter (general counter?) which
allows some kind of raw access and which also has
a generation id.
But in reality what is going on here is that there is a single value
which is a vm generation id, and we present it in two formats: uuid and raw.
> >> + representation.
> >> +
> >> + "raw" file is a 128-bit integer
> >> + representation (binary).
> >> diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
> >> index 03605f8fc0dc..a39feff6a867 100644
> >> --- a/drivers/misc/Kconfig
> >> +++ b/drivers/misc/Kconfig
> >> @@ -500,6 +500,13 @@ config MISC_RTSX
> >> tristate
> >> default MISC_RTSX_PCI || MISC_RTSX_USB
> >>
> >> +config VMGENID
> >> + depends on ACPI
> >> + tristate "Virtual Machine Generation ID driver"
> >> + help
> >> + This is a Virtual Machine Generation ID driver which provides
> >> + a virtual machine unique identifier.
> >> +
> >> source "drivers/misc/c2port/Kconfig"
> >> source "drivers/misc/eeprom/Kconfig"
> >> source "drivers/misc/cb710/Kconfig"
BTW maybe we want to make this depend on CONFIG_PARAVIRT as well.
> >> diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
> >> index c3c8624f4d95..067aa666bb6a 100644
> >> --- a/drivers/misc/Makefile
> >> +++ b/drivers/misc/Makefile
> >> @@ -57,6 +57,7 @@ obj-$(CONFIG_ASPEED_LPC_SNOOP) += aspeed-lpc-snoop.o
> >> obj-$(CONFIG_PCI_ENDPOINT_TEST) += pci_endpoint_test.o
> >> obj-$(CONFIG_OCXL) += ocxl/
> >> obj-$(CONFIG_MISC_RTSX) += cardreader/
> >> +obj-$(CONFIG_VMGENID) += vmgenid.o
> >>
> >> lkdtm-$(CONFIG_LKDTM) += lkdtm_core.o
> >> lkdtm-$(CONFIG_LKDTM) += lkdtm_bugs.o
> >> diff --git a/drivers/misc/vmgenid.c b/drivers/misc/vmgenid.c
> >> new file mode 100644
> >> index 000000000000..6c8d8fe75335
> >> --- /dev/null
> >> +++ b/drivers/misc/vmgenid.c
> >> @@ -0,0 +1,142 @@
> >> +// SPDX-License-Identifier: GPL-2.0
> >> +/*
> >> + * Virtual Machine Generation ID driver
> >> + *
> >> + * Copyright (C) 2018 Red Hat, Inc. All rights reserved.
> >> + * Authors:
> >> + * Or Idgar <[email protected]>
> >> + * Gal Hammer <[email protected]>
> >> + *
> >> + */
> >> +#include <linux/init.h>
> >> +#include <linux/module.h>
> >> +#include <linux/kernel.h>
> >> +#include <linux/kobject.h>
> >> +#include <linux/acpi.h>
> >> +#include <linux/uuid.h>
> >> +
> >> +MODULE_LICENSE("GPL");
> >> +MODULE_AUTHOR("Or Idgar <[email protected]>");
> >> +MODULE_AUTHOR("Gal Hammer <[email protected]>");
> >> +MODULE_DESCRIPTION("Virtual Machine Generation ID");
> >> +MODULE_VERSION("0.1");
> >> +
> >> +ACPI_MODULE_NAME("vmgenid");
> >> +
> >> +static u64 phys_addr;
> >> +
> >> +static ssize_t generation_id_show(struct device *_d,
> >> + struct device_attribute *attr, char *buf)
> >> +{
> >> + void __iomem *uuid_map;
> >> + uuid_t uuid;
> >> + ssize_t result;
> >> +
> >> + uuid_map = acpi_os_map_iomem(phys_addr, sizeof(uuid_t));
> >> + if (!uuid_map)
> >> + return -EFAULT;
> >
> > Shouldn't this be acpi_os_map_memory? Spec says it's never an IO address.
> >
> >> +
> >> + memcpy_fromio(&uuid, uuid_map, sizeof(uuid_t));
> >> + result = sprintf(buf, "%pUl\n", &uuid);
> >> + acpi_os_unmap_iomem(uuid_map, sizeof(uuid_t));
> >> + return result;
> >> +}
> >> +static DEVICE_ATTR_RO(generation_id);
> >> +
> >> +static ssize_t raw_show(struct device *_d,
> >> + struct device_attribute *attr,
> >> + char *buf)
> >> +{
> >> + void __iomem *uuid_map;
> >> +
> >> + uuid_map = acpi_os_map_iomem(phys_addr, sizeof(uuid_t));
> >> + if (!uuid_map)
> >> + return -EFAULT;
> >> + memcpy_fromio(buf, uuid_map, sizeof(uuid_t));
> >> + acpi_os_unmap_iomem(uuid_map, sizeof(uuid_t));
> >> + return sizeof(uuid_t);
> >> +}
> >> +static DEVICE_ATTR_RO(raw);
> >
> > I think you want BIN_ATTR_RO.
> >
> >
> >> +
> >> +static struct attribute *vmgenid_attrs[] = {
> >> + &dev_attr_generation_id.attr,
> >> + &dev_attr_raw.attr,
> >> + NULL,
> >> +};
> >> +static const struct attribute_group vmgenid_group = {
> >> + .name = "vm_gen_counter",
> >> + .attrs = vmgenid_attrs,
> >> +};
> >> +
> >> +static int get_vmgenid(acpi_handle handle)
> >> +{
> >> + int i;
> >> + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
> >> + acpi_status status;
> >> + union acpi_object *pss;
> >> + union acpi_object *element;
> >> +
> >> + status = acpi_evaluate_object(handle, "ADDR", NULL, &buffer);
> >> + if (ACPI_FAILURE(status)) {
> >> + ACPI_EXCEPTION((AE_INFO, status, "Evaluating _ADDR"));
> >
> > It's ADDR - not _ADDR, right?
> >
> >> + return -ENODEV;
> >> + }
> >> + pss = buffer.pointer;
> >> + if (!pss || pss->type != ACPI_TYPE_PACKAGE || pss->package.count != 2)
> >> + return -EFAULT;
> >> +
> >> + phys_addr = 0;
> >> + for (i = 0; i < pss->package.count; i++) {
> >> + element = &(pss->package.elements[i]);
> >> + if (element->type != ACPI_TYPE_INTEGER)
> >> + return -EFAULT;
> >
> > EINVAL here and elsewhere.
> >
> >> + phys_addr |= element->integer.value << i*32;
> >
> > i * 32
> >
> >> + }
> >> + return 0;
> >> +}
> >> +
> >> +static int acpi_vmgenid_add(struct acpi_device *device)
> >> +{
> >> + int retval;
> >> +
> >> + if (!device)
> >> + return -EINVAL;
> >> + retval = get_vmgenid(device->handle);
> >> + if (retval < 0)
> >> + return retval;
> >> + return sysfs_create_group(hypervisor_kobj, &vmgenid_group);
> >> +}
> >> +
> >> +static int acpi_vmgenid_remove(struct acpi_device *device)
> >> +{
> >> + sysfs_remove_group(hypervisor_kobj, &vmgenid_group);
> >> + return 0;
> >> +}
> >> +
> >> +static const struct acpi_device_id vmgenid_ids[] = {
> >> + {"QEMUVGID", 0},
> >> + {"", 0},
> >> +};
> >
> > That's not right I think. You should go by _CID and maybe
> > _DDN.
>
> You are probably right on this. However, we didn't see that Linux
> supports loading ACPI modules other than using the _HID value.
Are you sure about this?
if (info->valid & ACPI_VALID_CID) {
cid_list = &info->compatible_id_list;
for (i = 0; i < cid_list->count; i++)
acpi_add_id(pnp, cid_list->ids[i].string);
}
Anyway, spec is pretty clear on this so we have to make it work.
> >>
> >> +
> >> +static struct acpi_driver acpi_vmgenid_driver = {
> >> + .name = "vm_gen_counter",
> >> + .ids = vmgenid_ids,
> >> + .owner = THIS_MODULE,
> >> + .ops = {
> >> + .add = acpi_vmgenid_add,
> >> + .remove = acpi_vmgenid_remove,
> >> + }
> >> +};
> >> +
> >> +static int __init vmgenid_init(void)
> >> +{
> >> + return acpi_bus_register_driver(&acpi_vmgenid_driver);
> >> +}
> >> +
> >> +static void __exit vmgenid_exit(void)
> >> +{
> >> + acpi_bus_unregister_driver(&acpi_vmgenid_driver);
> >> +}
> >> +
> >> +module_init(vmgenid_init);
> >> +module_exit(vmgenid_exit);
> >
> > Thanks for your work, and I hope this helps!
> >
> > --
> > MST
>
> Thanks,
>
> Gal.
On Thu, Mar 15, 2018 at 02:19:59PM +0100, Greg KH wrote:
> On Wed, Mar 14, 2018 at 09:25:17PM +0200, Michael S. Tsirkin wrote:
> > On Wed, Mar 14, 2018 at 07:25:36PM +0100, Greg KH wrote:
> > > On Tue, Mar 13, 2018 at 07:40:51PM +0200, Michael S. Tsirkin wrote:
> > > > I think it's a good idea to use sysfs for this. However,
> > > > there are a couple of missing interfaces here:
> > > >
> > > > 1. Userspace needs a way to know when this value changes.
> > > > I see no change notifications here and that does not seem right.
> > >
> > > How can these change?
> >
> > It's a hardware register. It changes when hardware feels like it :)
>
> Does the hardware notify the kernel that it changes? Or does the kernel
> have to "poll" for it?
Yes - it sends an ACPI interrupt notification.
> > In particular, it changes whenever VM is migrated or snapshotted.
>
> So very rarely. And userspace always knows about those events already,
> right?
Not at all.
> > > > 2. Userspace needs to be able to read these without
> > > > system calls.
> > >
> > > Ick, what? Why not?
> > >
> > > > Pls add mmap support to the raw format.
> > >
> > > For a single integer? Why do you need mmap for this? What is so
> > > "performant" that needs to touch a sysfs file?
> > > > (Phys address is not guaranteed to be page-aligned so you will
> > > > probably want an offset attribute for that as well).
> > >
> > > Ick ick ick, that's why it's good to just stick with a sysfs file.
> > >
> > > Have you tested just how long this takes to see if the open/read/close
> > > is really the bottleneck, or if the io on reading the value is the
> > > bottleneck?
> > >
> > > thanks,
> > >
> > > greg k-h
> >
> > Well an application needs to check this value basically after
> > every database transaction.
>
> "every"? That's horrid, why would you write a database that has to do
> an ACPI i/o call for every transaction? That's a sure way to write a
> very slow database :(
Absolutely. That's why we might want to do an mmap and then get the ID
from memory. An alternative is poll support so userspace can get
notified about changes. Opens a bigger window during which you
are doing duplicate work, but maybe that's not such a big issue.
> > So I'm pretty sure it's a performance sensitive path.
>
> Given that this api is not present today, why is this even needed? Who
> wants/needs it so badly that it has to be tuned in ways like this?
>
> If it is _really_ performant critical, just make it a new syscall :)
Maybe you are right and it is a premature optimization. Let's put it out
there without mmap support and see what happens - but then we definitely
need poll support.
> > But yes, I
> > didn't profile any apps since they
> > are yet to be written to use this interface.
>
> Then what database are you talking about? What apps need/want this
> thing?
>
> thanks,
>
> greg k-h
Anything that runs within a VM that is snapshoted is at risk
of sending duplicate transactions when it's restored and
time rolls back to a random point in the past.
If you want the application to have ability to detect these, then VM gen
ID offers a way to do it:
id=read_id()
do_work()
new_id=read_id()
if (new_id != id)
find_and_handle_duplicate_work()
--
MST
On Thu, 2018-03-01 at 16:22 +0200, Or Idgar wrote:
> +
> +static const struct acpi_device_id vmgenid_ids[] = {
> + {"QEMUVGID", 0},
> + {"", 0},
> +};
> +
This is wrong. You're supposed to match on the _CID "VM_Gen_Counter".