From: Oleksandr Tyshchenko <[email protected]>
Use the same "xen-grant-dma" device concept (based on generic IOMMU
device-tree bindings) for the PCI devices behind device-tree based
PCI Host controller.
Signed-off-by: Oleksandr Tyshchenko <[email protected]>
---
Slightly RFC. This is needed to support Xen grant mappings for virtio-pci devices
on Arm at some point in the future. The Xen toolstack side is not published yet.
Here, for PCI devices we use the same way to pass backend domid to the guest as for
platform devices.
Depends on Juergen's series:
https://lore.kernel.org/xen-devel/[email protected]/
---
drivers/xen/grant-dma-ops.c | 51 +++++++++++++++++++++++++++++--------
1 file changed, 41 insertions(+), 10 deletions(-)
diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c
index ff9be3aff87e..79d13122ec08 100644
--- a/drivers/xen/grant-dma-ops.c
+++ b/drivers/xen/grant-dma-ops.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/dma-map-ops.h>
#include <linux/of.h>
+#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/xarray.h>
#include <linux/virtio_anchor.h>
@@ -273,12 +274,28 @@ static const struct dma_map_ops xen_grant_dma_ops = {
.dma_supported = xen_grant_dma_supported,
};
-static bool xen_is_dt_grant_dma_device(struct device *dev)
+static struct device_node *xen_dt_get_node(struct device *dev)
+{
+ if (dev_is_pci(dev)) {
+ struct pci_dev *pdev = to_pci_dev(dev);
+ struct pci_bus *bus = pdev->bus;
+
+ /* Walk up to the root bus to look for PCI Host controller */
+ while (!pci_is_root_bus(bus))
+ bus = bus->parent;
+
+ return of_node_get(bus->bridge->parent->of_node);
+ }
+
+ return of_node_get(dev->of_node);
+}
+
+static bool xen_is_dt_grant_dma_device(struct device_node *np)
{
struct device_node *iommu_np;
bool has_iommu;
- iommu_np = of_parse_phandle(dev->of_node, "iommus", 0);
+ iommu_np = of_parse_phandle(np, "iommus", 0);
has_iommu = iommu_np &&
of_device_is_compatible(iommu_np, "xen,grant-dma");
of_node_put(iommu_np);
@@ -288,9 +305,17 @@ static bool xen_is_dt_grant_dma_device(struct device *dev)
bool xen_is_grant_dma_device(struct device *dev)
{
+ struct device_node *np;
+
/* XXX Handle only DT devices for now */
- if (dev->of_node)
- return xen_is_dt_grant_dma_device(dev);
+ np = xen_dt_get_node(dev);
+ if (np) {
+ bool ret;
+
+ ret = xen_is_dt_grant_dma_device(np);
+ of_node_put(np);
+ return ret;
+ }
return false;
}
@@ -303,20 +328,20 @@ bool xen_virtio_mem_acc(struct virtio_device *dev)
return xen_is_grant_dma_device(dev->dev.parent);
}
-static int xen_dt_grant_setup_dma_ops(struct device *dev,
+static int xen_dt_grant_setup_dma_ops(struct device_node *np,
struct xen_grant_dma_data *data)
{
struct of_phandle_args iommu_spec;
- if (of_parse_phandle_with_args(dev->of_node, "iommus", "#iommu-cells",
+ if (of_parse_phandle_with_args(np, "iommus", "#iommu-cells",
0, &iommu_spec)) {
- dev_err(dev, "Cannot parse iommus property\n");
+ pr_err("%s: Cannot parse iommus property\n", np->name);
return -ESRCH;
}
if (!of_device_is_compatible(iommu_spec.np, "xen,grant-dma") ||
iommu_spec.args_count != 1) {
- dev_err(dev, "Incompatible IOMMU node\n");
+ pr_err("%s: Incompatible IOMMU node\n", iommu_spec.np->name);
of_node_put(iommu_spec.np);
return -ESRCH;
}
@@ -335,6 +360,7 @@ static int xen_dt_grant_setup_dma_ops(struct device *dev,
void xen_grant_setup_dma_ops(struct device *dev)
{
struct xen_grant_dma_data *data;
+ struct device_node *np;
data = find_xen_grant_dma_data(dev);
if (data) {
@@ -346,8 +372,13 @@ void xen_grant_setup_dma_ops(struct device *dev)
if (!data)
goto err;
- if (dev->of_node) {
- if (xen_dt_grant_setup_dma_ops(dev, data))
+ np = xen_dt_get_node(dev);
+ if (np) {
+ int ret;
+
+ ret = xen_dt_grant_setup_dma_ops(np, data);
+ of_node_put(np);
+ if (ret)
goto err;
} else if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT)) {
dev_info(dev, "Using dom0 as backend\n");
--
2.25.1
On Thu, 6 Oct 2022, Oleksandr Tyshchenko wrote:
> From: Oleksandr Tyshchenko <[email protected]>
>
> Use the same "xen-grant-dma" device concept (based on generic IOMMU
> device-tree bindings) for the PCI devices behind device-tree based
> PCI Host controller.
>
> Signed-off-by: Oleksandr Tyshchenko <[email protected]>
> ---
> Slightly RFC. This is needed to support Xen grant mappings for virtio-pci devices
> on Arm at some point in the future. The Xen toolstack side is not published yet.
> Here, for PCI devices we use the same way to pass backend domid to the guest as for
> platform devices.
>
> Depends on Juergen's series:
> https://lore.kernel.org/xen-devel/[email protected]/
> ---
> drivers/xen/grant-dma-ops.c | 51 +++++++++++++++++++++++++++++--------
> 1 file changed, 41 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c
> index ff9be3aff87e..79d13122ec08 100644
> --- a/drivers/xen/grant-dma-ops.c
> +++ b/drivers/xen/grant-dma-ops.c
> @@ -10,6 +10,7 @@
> #include <linux/module.h>
> #include <linux/dma-map-ops.h>
> #include <linux/of.h>
> +#include <linux/pci.h>
> #include <linux/pfn.h>
> #include <linux/xarray.h>
> #include <linux/virtio_anchor.h>
> @@ -273,12 +274,28 @@ static const struct dma_map_ops xen_grant_dma_ops = {
> .dma_supported = xen_grant_dma_supported,
> };
>
> -static bool xen_is_dt_grant_dma_device(struct device *dev)
> +static struct device_node *xen_dt_get_node(struct device *dev)
> +{
> + if (dev_is_pci(dev)) {
> + struct pci_dev *pdev = to_pci_dev(dev);
> + struct pci_bus *bus = pdev->bus;
> +
> + /* Walk up to the root bus to look for PCI Host controller */
> + while (!pci_is_root_bus(bus))
> + bus = bus->parent;
> +
> + return of_node_get(bus->bridge->parent->of_node);
> + }
Is it possible to have multiple virtio devices under a single virtio-pci
root complex? What if virtio-net has the backend in dom0 and
virtio-block has the backend in dom1?
Or each virtio PCI device shows up under a different PCI root complex?
If we can have multiple virtio PCI devices under a single PCI root
complex, then I think it would be better to check for a per-device
property, rather than a single property at the PCI root complex level.
The first thing that comes to mind is to describe each PCI device under
the root complex in device tree. Although it is uncommon (usually only
the PCI root complex is described in device tree), it is possible to
also describe in device tree all the individual PCI devices under the
root complex.
Given that the domU device tree is generated by Xen and/or the Xen
toolstack, it would be easy to arrange for it to happen.
That would solve the issue as far as I can tell, but I worry it might
not be a good idea because if we rely on the per-device device tree node
to be present then it becomes harder to implement virtio hotplug
(Virtio hotplug is important to add dom0less support.)
Let's say that we create a dom0less domU with an emulated PCI root
complex without any devices under it, then after Dom0 is fully booted,
we add a virtio-net emulated device. How do we tell the guest what is
the backend domain id?
Device tree and other firmware tables are not relevant anymore.
We could reuse a PCI config space register to expose the backend id.
However this solution requires a backend change (QEMU) to expose the
backend id via an emulated register for each emulated device.
To avoid having to introduce a special config space register in all
emulated PCI devices (virtio-net, virtio-block, etc) I wonder if we
could add a special PCI config space register at the emulated PCI Root
Complex level.
Basically the workflow would be as follow:
- Linux recognizes the PCI Root Complex as a Xen PCI Root Complex
- Linux writes to special PCI config space register of the Xen PCI Root
Complex the PCI device id (basically the BDF)
- The Xen PCI Root Complex emulated by Xen answers by writing back to
the same location the backend id (domid of the backend)
- Linux reads back the same PCI config space register of the Xen PCI
Root Complex and learn the relevant domid
What do you think?
Other ideas welcome!
> + return of_node_get(dev->of_node);
> +}
> +
> +static bool xen_is_dt_grant_dma_device(struct device_node *np)
> {
> struct device_node *iommu_np;
> bool has_iommu;
>
> - iommu_np = of_parse_phandle(dev->of_node, "iommus", 0);
> + iommu_np = of_parse_phandle(np, "iommus", 0);
> has_iommu = iommu_np &&
> of_device_is_compatible(iommu_np, "xen,grant-dma");
> of_node_put(iommu_np);
> @@ -288,9 +305,17 @@ static bool xen_is_dt_grant_dma_device(struct device *dev)
>
> bool xen_is_grant_dma_device(struct device *dev)
> {
> + struct device_node *np;
> +
> /* XXX Handle only DT devices for now */
> - if (dev->of_node)
> - return xen_is_dt_grant_dma_device(dev);
> + np = xen_dt_get_node(dev);
> + if (np) {
> + bool ret;
> +
> + ret = xen_is_dt_grant_dma_device(np);
> + of_node_put(np);
> + return ret;
> + }
>
> return false;
> }
> @@ -303,20 +328,20 @@ bool xen_virtio_mem_acc(struct virtio_device *dev)
> return xen_is_grant_dma_device(dev->dev.parent);
> }
>
> -static int xen_dt_grant_setup_dma_ops(struct device *dev,
> +static int xen_dt_grant_setup_dma_ops(struct device_node *np,
> struct xen_grant_dma_data *data)
> {
> struct of_phandle_args iommu_spec;
>
> - if (of_parse_phandle_with_args(dev->of_node, "iommus", "#iommu-cells",
> + if (of_parse_phandle_with_args(np, "iommus", "#iommu-cells",
> 0, &iommu_spec)) {
> - dev_err(dev, "Cannot parse iommus property\n");
> + pr_err("%s: Cannot parse iommus property\n", np->name);
> return -ESRCH;
> }
>
> if (!of_device_is_compatible(iommu_spec.np, "xen,grant-dma") ||
> iommu_spec.args_count != 1) {
> - dev_err(dev, "Incompatible IOMMU node\n");
> + pr_err("%s: Incompatible IOMMU node\n", iommu_spec.np->name);
> of_node_put(iommu_spec.np);
> return -ESRCH;
> }
> @@ -335,6 +360,7 @@ static int xen_dt_grant_setup_dma_ops(struct device *dev,
> void xen_grant_setup_dma_ops(struct device *dev)
> {
> struct xen_grant_dma_data *data;
> + struct device_node *np;
>
> data = find_xen_grant_dma_data(dev);
> if (data) {
> @@ -346,8 +372,13 @@ void xen_grant_setup_dma_ops(struct device *dev)
> if (!data)
> goto err;
>
> - if (dev->of_node) {
> - if (xen_dt_grant_setup_dma_ops(dev, data))
> + np = xen_dt_get_node(dev);
> + if (np) {
> + int ret;
> +
> + ret = xen_dt_grant_setup_dma_ops(np, data);
> + of_node_put(np);
> + if (ret)
> goto err;
> } else if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT)) {
> dev_info(dev, "Using dom0 as backend\n");
> --
> 2.25.1
>
On 06.10.22 19:48, Oleksandr Tyshchenko wrote:
> From: Oleksandr Tyshchenko <[email protected]>
>
> Use the same "xen-grant-dma" device concept (based on generic IOMMU
> device-tree bindings) for the PCI devices behind device-tree based
> PCI Host controller.
>
> Signed-off-by: Oleksandr Tyshchenko <[email protected]>
> ---
> Slightly RFC. This is needed to support Xen grant mappings for virtio-pci devices
> on Arm at some point in the future. The Xen toolstack side is not published yet.
> Here, for PCI devices we use the same way to pass backend domid to the guest as for
> platform devices.
I should mention we decided at the Xen Summit, that I will start a try to
modify the virtio spec to include the backend id (domid in the Xen case)
in the device independent config part.
As this will take some time to be accepted (if ever), other means to
specify the backend domid are needed until then. DT is one possibility
(at least on Arm), while Xenstore is the way to go for setups with a
Xen toolstack.
Juergen
On Fri, 7 Oct 2022, Juergen Gross wrote:
> On 06.10.22 19:48, Oleksandr Tyshchenko wrote:
> > From: Oleksandr Tyshchenko <[email protected]>
> >
> > Use the same "xen-grant-dma" device concept (based on generic IOMMU
> > device-tree bindings) for the PCI devices behind device-tree based
> > PCI Host controller.
> >
> > Signed-off-by: Oleksandr Tyshchenko <[email protected]>
> > ---
> > Slightly RFC. This is needed to support Xen grant mappings for virtio-pci
> > devices
> > on Arm at some point in the future. The Xen toolstack side is not published
> > yet.
> > Here, for PCI devices we use the same way to pass backend domid to the guest
> > as for
> > platform devices.
>
> I should mention we decided at the Xen Summit, that I will start a try to
> modify the virtio spec to include the backend id (domid in the Xen case)
> in the device independent config part.
Good idea
> As this will take some time to be accepted (if ever), other means to
> specify the backend domid are needed until then. DT is one possibility
> (at least on Arm), while Xenstore is the way to go for setups with a
> Xen toolstack.
What do you think of my idea of using PCI config registers on the *root
complex* (not the device) to retrieve the information?
On 07.10.22 04:14, Stefano Stabellini wrote:
Hello Stefano
Thank you for the detailed analysis. Please see answers below.
> On Thu, 6 Oct 2022, Oleksandr Tyshchenko wrote:
>> From: Oleksandr Tyshchenko <[email protected]>
>>
>> Use the same "xen-grant-dma" device concept (based on generic IOMMU
>> device-tree bindings) for the PCI devices behind device-tree based
>> PCI Host controller.
>>
>> Signed-off-by: Oleksandr Tyshchenko <[email protected]>
>> ---
>> Slightly RFC. This is needed to support Xen grant mappings for virtio-pci devices
>> on Arm at some point in the future. The Xen toolstack side is not published yet.
>> Here, for PCI devices we use the same way to pass backend domid to the guest as for
>> platform devices.
>>
>> Depends on Juergen's series:
>> https://urldefense.com/v3/__https://lore.kernel.org/xen-devel/[email protected]/__;!!GF_29dbcQIUBPA!waOk2Goc7qlhNo5-csRObryil_GzMF_e61EJR501oJ08cH2dnJulsZXWlelBDTBqa63TVoUcWQTB5NecJ1p4xFNgh2_EuA$ [lore[.]kernel[.]org]
>> ---
>> drivers/xen/grant-dma-ops.c | 51 +++++++++++++++++++++++++++++--------
>> 1 file changed, 41 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c
>> index ff9be3aff87e..79d13122ec08 100644
>> --- a/drivers/xen/grant-dma-ops.c
>> +++ b/drivers/xen/grant-dma-ops.c
>> @@ -10,6 +10,7 @@
>> #include <linux/module.h>
>> #include <linux/dma-map-ops.h>
>> #include <linux/of.h>
>> +#include <linux/pci.h>
>> #include <linux/pfn.h>
>> #include <linux/xarray.h>
>> #include <linux/virtio_anchor.h>
>> @@ -273,12 +274,28 @@ static const struct dma_map_ops xen_grant_dma_ops = {
>> .dma_supported = xen_grant_dma_supported,
>> };
>>
>> -static bool xen_is_dt_grant_dma_device(struct device *dev)
>> +static struct device_node *xen_dt_get_node(struct device *dev)
>> +{
>> + if (dev_is_pci(dev)) {
>> + struct pci_dev *pdev = to_pci_dev(dev);
>> + struct pci_bus *bus = pdev->bus;
>> +
>> + /* Walk up to the root bus to look for PCI Host controller */
>> + while (!pci_is_root_bus(bus))
>> + bus = bus->parent;
>> +
>> + return of_node_get(bus->bridge->parent->of_node);
>> + }
> Is it possible to have multiple virtio devices under a single virtio-pci
> root complex?
yes
> What if virtio-net has the backend in dom0 and
> virtio-block has the backend in dom1?
>
> Or each virtio PCI device shows up under a different PCI root complex?
Good questions. To be honest, it is not 100% clear to me yet how it is
supposed to be. But let's guess...
I think that having a PCI Host bridge per virtio-pci device is overkill.
So, I see two options here:
1. We provide PCI Host bridge per backends domain to the guest, so each
PCI Host bridge covers only virtio-pci devices whose backends are
running within *the same* domain.
With that we would be able to use property at PCI Host bridge level.
2. We provide only a single PCI Host bridge to the guest, so that single
PCI Host bridge covers all virtio-pci devices assigned to this guest.
No matter where the corresponding backends are running (the
virtio-devices under that PCI Host bridge can have the backends in
different domains).
With that we wouldn’t be able to use property at PCI Host bridge level.
And we need a more flexible option(s) to be able to distinguish between
virtio-pci devices.
Taking into account that for virtio-pci on Arm we need to emulate a
specific PCI Host bridge in Xen to intercept the guest PCI config space
accesses
(detect what PCI device is targeted) and forward them to the appropriate
backend (IOREQ Server),
it feels to me that we likely need to go with the second option here
(one PCI host bridge per guest). I may be mistaken,
but I don’t think that we want to emulate several PCI Host bridges for a
single guest (more code, more resources, etc).
>
> If we can have multiple virtio PCI devices under a single PCI root
> complex, then I think it would be better to check for a per-device
> property, rather than a single property at the PCI root complex level.
Completely agree.
>
> The first thing that comes to mind is to describe each PCI device under
> the root complex in device tree. Although it is uncommon (usually only
> the PCI root complex is described in device tree), it is possible to
> also describe in device tree all the individual PCI devices under the
> root complex.
>
> Given that the domU device tree is generated by Xen and/or the Xen
> toolstack, it would be easy to arrange for it to happen.
Technically yes. If we decide to provide only a single PCI Host bridge
to the guest, we will have to deal with the virtio-pci devices with
various backend_domid,
so we can consider using more flexible property
“iommu-map”/”iommu-map-mask” specially introduced for such purposes:
https://www.kernel.org/doc/Documentation/devicetree/bindings/pci/pci-iommu.txt
I think, with that we would be able to describe, e.g., that virtio-pci
device A connects to stream_id (backend_domid) X and virtio-pci device B
to stream_id Y,
and virtio-pci device C to nothing (so is not required to use grants),
unless I missed something.
I have looked at it and I don’t see at the moment why the idea wouldn’t
work, but I haven’t experimented with that yet in such context.
>
> That would solve the issue as far as I can tell, but I worry it might
> not be a good idea because if we rely on the per-device device tree node
> to be present then it becomes harder to implement virtio hotplug
> (Virtio hotplug is important to add dom0less support.)
>
> Let's say that we create a dom0less domU with an emulated PCI root
> complex without any devices under it, then after Dom0 is fully booted,
> we add a virtio-net emulated device. How do we tell the guest what is
> the backend domain id?
>
> Device tree and other firmware tables are not relevant anymore.
>
> We could reuse a PCI config space register to expose the backend id.
> However this solution requires a backend change (QEMU) to expose the
> backend id via an emulated register for each emulated device.
>
> To avoid having to introduce a special config space register in all
> emulated PCI devices (virtio-net, virtio-block, etc) I wonder if we
> could add a special PCI config space register at the emulated PCI Root
> Complex level.
>
> Basically the workflow would be as follow:
>
> - Linux recognizes the PCI Root Complex as a Xen PCI Root Complex
> - Linux writes to special PCI config space register of the Xen PCI Root
> Complex the PCI device id (basically the BDF)
> - The Xen PCI Root Complex emulated by Xen answers by writing back to
> the same location the backend id (domid of the backend)
> - Linux reads back the same PCI config space register of the Xen PCI
> Root Complex and learn the relevant domid
>
> What do you think?
I think the idea sounds indeed interesting and would probably work, but
would require guest modifications other than just in drivers/xen (and
likely the specification changes as well).
Which ideally of course should be avoided.
Also I was thinking it would be nice not to diverge much between
communicating the backend_domid for platform and PCI devices on Arm with
device tree.
If we managed to re-use generic IOMMU device-tree bindings for
virtio-mmio, we would likely be able to re-use PCI-IOMMU device-tree
bindings for virtio-pci,
at least for boot PCI devices (which are known at the domain creation time).
Moreover, the binding is already present:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/devicetree/bindings/iommu/xen,grant-dma.yaml
Regarding the hotplug devices, yes, it is a valid use-case which should
also be supported with virtio-pci; I assume Xenstore could be
re-used for that purpose if it is available.
The Xenstore is available with the usual Dom0 and toolstack, is it
available with dom0less?
>
> Other ideas welcome!
>
>
>
>> + return of_node_get(dev->of_node);
>> +}
>> +
>> +static bool xen_is_dt_grant_dma_device(struct device_node *np)
>> {
>> struct device_node *iommu_np;
>> bool has_iommu;
>>
>> - iommu_np = of_parse_phandle(dev->of_node, "iommus", 0);
>> + iommu_np = of_parse_phandle(np, "iommus", 0);
>> has_iommu = iommu_np &&
>> of_device_is_compatible(iommu_np, "xen,grant-dma");
>> of_node_put(iommu_np);
>> @@ -288,9 +305,17 @@ static bool xen_is_dt_grant_dma_device(struct device *dev)
>>
>> bool xen_is_grant_dma_device(struct device *dev)
>> {
>> + struct device_node *np;
>> +
>> /* XXX Handle only DT devices for now */
>> - if (dev->of_node)
>> - return xen_is_dt_grant_dma_device(dev);
>> + np = xen_dt_get_node(dev);
>> + if (np) {
>> + bool ret;
>> +
>> + ret = xen_is_dt_grant_dma_device(np);
>> + of_node_put(np);
>> + return ret;
>> + }
>>
>> return false;
>> }
>> @@ -303,20 +328,20 @@ bool xen_virtio_mem_acc(struct virtio_device *dev)
>> return xen_is_grant_dma_device(dev->dev.parent);
>> }
>>
>> -static int xen_dt_grant_setup_dma_ops(struct device *dev,
>> +static int xen_dt_grant_setup_dma_ops(struct device_node *np,
>> struct xen_grant_dma_data *data)
>> {
>> struct of_phandle_args iommu_spec;
>>
>> - if (of_parse_phandle_with_args(dev->of_node, "iommus", "#iommu-cells",
>> + if (of_parse_phandle_with_args(np, "iommus", "#iommu-cells",
>> 0, &iommu_spec)) {
>> - dev_err(dev, "Cannot parse iommus property\n");
>> + pr_err("%s: Cannot parse iommus property\n", np->name);
>> return -ESRCH;
>> }
>>
>> if (!of_device_is_compatible(iommu_spec.np, "xen,grant-dma") ||
>> iommu_spec.args_count != 1) {
>> - dev_err(dev, "Incompatible IOMMU node\n");
>> + pr_err("%s: Incompatible IOMMU node\n", iommu_spec.np->name);
>> of_node_put(iommu_spec.np);
>> return -ESRCH;
>> }
>> @@ -335,6 +360,7 @@ static int xen_dt_grant_setup_dma_ops(struct device *dev,
>> void xen_grant_setup_dma_ops(struct device *dev)
>> {
>> struct xen_grant_dma_data *data;
>> + struct device_node *np;
>>
>> data = find_xen_grant_dma_data(dev);
>> if (data) {
>> @@ -346,8 +372,13 @@ void xen_grant_setup_dma_ops(struct device *dev)
>> if (!data)
>> goto err;
>>
>> - if (dev->of_node) {
>> - if (xen_dt_grant_setup_dma_ops(dev, data))
>> + np = xen_dt_get_node(dev);
>> + if (np) {
>> + int ret;
>> +
>> + ret = xen_dt_grant_setup_dma_ops(np, data);
>> + of_node_put(np);
>> + if (ret)
>> goto err;
>> } else if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT)) {
>> dev_info(dev, "Using dom0 as backend\n");
>> --
>> 2.25.1
>>
--
Regards,
Oleksandr Tyshchenko
On Wed, 12 Oct 2022, Oleksandr Tyshchenko wrote:
> > On Thu, 6 Oct 2022, Oleksandr Tyshchenko wrote:
> >> From: Oleksandr Tyshchenko <[email protected]>
> >>
> >> Use the same "xen-grant-dma" device concept (based on generic IOMMU
> >> device-tree bindings) for the PCI devices behind device-tree based
> >> PCI Host controller.
> >>
> >> Signed-off-by: Oleksandr Tyshchenko <[email protected]>
> >> ---
> >> Slightly RFC. This is needed to support Xen grant mappings for virtio-pci devices
> >> on Arm at some point in the future. The Xen toolstack side is not published yet.
> >> Here, for PCI devices we use the same way to pass backend domid to the guest as for
> >> platform devices.
> >>
> >> Depends on Juergen's series:
> >> https://urldefense.com/v3/__https://lore.kernel.org/xen-devel/[email protected]/__;!!GF_29dbcQIUBPA!waOk2Goc7qlhNo5-csRObryil_GzMF_e61EJR501oJ08cH2dnJulsZXWlelBDTBqa63TVoUcWQTB5NecJ1p4xFNgh2_EuA$ [lore[.]kernel[.]org]
> >> ---
> >> drivers/xen/grant-dma-ops.c | 51 +++++++++++++++++++++++++++++--------
> >> 1 file changed, 41 insertions(+), 10 deletions(-)
> >>
> >> diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c
> >> index ff9be3aff87e..79d13122ec08 100644
> >> --- a/drivers/xen/grant-dma-ops.c
> >> +++ b/drivers/xen/grant-dma-ops.c
> >> @@ -10,6 +10,7 @@
> >> #include <linux/module.h>
> >> #include <linux/dma-map-ops.h>
> >> #include <linux/of.h>
> >> +#include <linux/pci.h>
> >> #include <linux/pfn.h>
> >> #include <linux/xarray.h>
> >> #include <linux/virtio_anchor.h>
> >> @@ -273,12 +274,28 @@ static const struct dma_map_ops xen_grant_dma_ops = {
> >> .dma_supported = xen_grant_dma_supported,
> >> };
> >>
> >> -static bool xen_is_dt_grant_dma_device(struct device *dev)
> >> +static struct device_node *xen_dt_get_node(struct device *dev)
> >> +{
> >> + if (dev_is_pci(dev)) {
> >> + struct pci_dev *pdev = to_pci_dev(dev);
> >> + struct pci_bus *bus = pdev->bus;
> >> +
> >> + /* Walk up to the root bus to look for PCI Host controller */
> >> + while (!pci_is_root_bus(bus))
> >> + bus = bus->parent;
> >> +
> >> + return of_node_get(bus->bridge->parent->of_node);
> >> + }
> > Is it possible to have multiple virtio devices under a single virtio-pci
> > root complex?
>
> yes
>
>
> > What if virtio-net has the backend in dom0 and
> > virtio-block has the backend in dom1?
> >
> > Or each virtio PCI device shows up under a different PCI root complex?
>
>
> Good questions. To be honest, it is not 100% clear to me yet how it is
> supposed to be. But let's guess...
>
> I think that having a PCI Host bridge per virtio-pci device is overkill.
>
> So, I see two options here:
> 1. We provide PCI Host bridge per backends domain to the guest, so each
> PCI Host bridge covers only virtio-pci devices whose backends are
> running within *the same* domain.
> With that we would be able to use property at PCI Host bridge level.
>
> 2. We provide only a single PCI Host bridge to the guest, so that single
> PCI Host bridge covers all virtio-pci devices assigned to this guest.
> No matter where the corresponding backends are running (the
> virtio-devices under that PCI Host bridge can have the backends in
> different domains).
> With that we wouldn’t be able to use property at PCI Host bridge level.
> And we need a more flexible option(s) to be able distinguish between
> virtio-pci devices.
>
> Taking into account that for virtio-pci on Arm we need to emulate a
> specific PCI Host bridge in Xen to intercept the guest PCI config space
> accesses
> (detect what PCI device is targeted) and forward them to the appropriate
> backend (IOREQ Server),
> it feels to me that we likely need to go with the second option here
> (one PCI host bridge per guest), I may mistake,
> but I don’t think that we want to emulate several PCI Host bridges for a
> single guest (more code, more resources, etc).
>
>
>
> >
> > If we can have multiple virtio PCI devices under a single PCI root
> > complex, then I think it would be better to check for a per-device
> > property, rather than a single property at the PCI root complex level.
>
> Completely agree.
>
>
> >
> > The first thing that comes to mind is to describe each PCI device under
> > the root complex in device tree. Although it is uncommon (usually only
> > the PCI root complex is described in device tree), it is possible to
> > also describe in device tree all the individual PCI devices under the
> > root complex.
> >
> > Given that the domU device tree is generated by Xen and/or the Xen
> > toolstack, it would be easy to arrange for it to happen.
>
> Technically yes. If we decide to provide only a single PCI Host bridge
> to the guest, we will have have to deal with the virtio-pci devices with
> various backend_domid,
> so we can consider using more flexible property
> “iommu-map”/”iommu-map-mask” specially introduced for such purposes:
> https://www.kernel.org/doc/Documentation/devicetree/bindings/pci/pci-iommu.txt
> I think, with that we would be able to describe, i.e that virtio-pci
> device A connects to stream_id (backend_domid) X and virtio-pci device B
> to stream_id Y,
> and virtio-pci device C to nothing (so is not required to use grants),
> unless I missed something.
>
> I have looked at it and I don’t see at the moment why the idea wouldn’t
> work, but I haven’t experimented with that yet in such context.
I think it would work too
> > That would solve the issue as far as I can tell, but I worry it might
> > not be a good idea because if we rely on the per-device device tree node
> > to be present then it becomes harder to implement virtio hotplug
> > (Virtio hotplug is important to add dom0less support.)
> >
> > Let's say that we create a dom0less domU with an emulated PCI root
> > complex without any devices under it, then after Dom0 is fully booted,
> > we add a virtio-net emulated device. How do we tell the guest what is
> > the backend domain id?
> >
> > Device tree and other firmware tables are not relevant anymore.
> >
> > We could reuse a PCI config space register to expose the backend id.
> > However this solution requires a backend change (QEMU) to expose the
> > backend id via an emulated register for each emulated device.
> >
> > To avoid having to introduce a special config space register in all
> > emulated PCI devices (virtio-net, virtio-block, etc) I wonder if we
> > could add a special PCI config space register at the emulated PCI Root
> > Complex level.
> >
> > Basically the workflow would be as follow:
> >
> > - Linux recognizes the PCI Root Complex as a Xen PCI Root Complex
> > - Linux writes to special PCI config space register of the Xen PCI Root
> > Complex the PCI device id (basically the BDF)
> > - The Xen PCI Root Complex emulated by Xen answers by writing back to
> > the same location the backend id (domid of the backend)
> > - Linux reads back the same PCI config space register of the Xen PCI
> > Root Complex and learn the relevant domid
> >
> > What do you think?
>
>
> I think the idea sounds indeed interesting and would probably work, but
> would require guest modifications other than just in drivers/xen (and
> likely the specification changes as well).
> Which ideally of course should be avoided.
> Also I was thinking it would be nice not to diverge much between
> communicating the backend_domid for platform and PCI devices on Arm with
> device tree.
>
> If we managed to re-use generic IOMMU device-tree bindings for
> virtio-mmio, we would likely be able to re-use PCI-IOMMU device-tree
> bindings for virtio-pci,
> at least for boot PCI devices (which are known at the domain creation time).
> The more, the bindings is already present:
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/devicetree/bindings/iommu/xen,grant-dma.yaml
I think using a special config space register in the root complex would
not be terrible in terms of guest changes because it is easy to
introduce a new root complex driver in Linux and other OSes. The root
complex would still be ECAM compatible so the regular ECAM driver would
still work. A new driver would only be necessary if you want to be able
to access the special config space register.
That said, for sure the fewer changes to the guest the better and I like
the existing xen,grant-dma binding.
> Regarding the hotplug devices, yes it is a valid use-case which should
> be also supported with virtio-pci, I assume the Xenstore could be
> re-used for that purpose if it would be available.
> The Xenstore is available with the usual Dom0 and toolstack, is it
> available with dom0less?
Xenstore is available for dom0less if we have a dom0 running with
xenstored as one of the dom0less domains. We currently rely on it for
Xen PV drivers with dom0less. After dom0 is fully booted, we use "xl
network-attach" to create a vif interface dynamically in the domU.
That is why I was thinking of using virtio hotplug to solve the same
problem with virtio, I was imagining that after dom0 is fully booted we
would do "xl virtio-attach" and create a new virtio interface in the
domU. But I cannot see an easy way to make virtio hotplug work together
with the xen,grant-dma bindings. I think it would be better if we find a
way to make it work without xenstore (because xenstore would be a
safety-certification dependency).
Maybe we need to think outside the box and find another solution that
doesn't rely on hotplug.
For instance, let's say that we expose the virtio devices in device tree
in a dom0less configuration too but with status = "disabled". When dom0
(or backend domain) is up and running it can signal that it is ready.
Maybe if we had a special Xen-specific PCI Root Complex driver in the
guest, it could wait for the Xen signal and then continue PCI probing at
that point honoring xen,grant-dma bindings if present in device tree
even if the devices had status = "disabled" initially.
It looks like that would require many guest changes unfortunately.
As an alternative I wonder, given that Xen emulates the PCI root
complex, if we can reuse one of the PCI link up/down delays for this
instead, like "pcie_wait_for_link". It looks like the wait time is in
millisec while we would need potentially several seconds here but it
might be possible?
Other ideas?
On 13.10.22 03:33, Stefano Stabellini wrote:
Hello Stefano
> On Wed, 12 Oct 2022, Oleksandr Tyshchenko wrote:
>>> On Thu, 6 Oct 2022, Oleksandr Tyshchenko wrote:
>>>> From: Oleksandr Tyshchenko <[email protected]>
>>>>
>>>> Use the same "xen-grant-dma" device concept (based on generic IOMMU
>>>> device-tree bindings) for the PCI devices behind device-tree based
>>>> PCI Host controller.
>>>>
>>>> Signed-off-by: Oleksandr Tyshchenko <[email protected]>
>>>> ---
>>>> Slightly RFC. This is needed to support Xen grant mappings for virtio-pci devices
>>>> on Arm at some point in the future. The Xen toolstack side is not published yet.
>>>> Here, for PCI devices we use the same way to pass backend domid to the guest as for
>>>> platform devices.
>>>>
>>>> Depends on Juergen's series:
>>>> https://urldefense.com/v3/__https://lore.kernel.org/xen-devel/[email protected]/__;!!GF_29dbcQIUBPA!waOk2Goc7qlhNo5-csRObryil_GzMF_e61EJR501oJ08cH2dnJulsZXWlelBDTBqa63TVoUcWQTB5NecJ1p4xFNgh2_EuA$ [lore[.]kernel[.]org]
>>>> ---
>>>> drivers/xen/grant-dma-ops.c | 51 +++++++++++++++++++++++++++++--------
>>>> 1 file changed, 41 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c
>>>> index ff9be3aff87e..79d13122ec08 100644
>>>> --- a/drivers/xen/grant-dma-ops.c
>>>> +++ b/drivers/xen/grant-dma-ops.c
>>>> @@ -10,6 +10,7 @@
>>>> #include <linux/module.h>
>>>> #include <linux/dma-map-ops.h>
>>>> #include <linux/of.h>
>>>> +#include <linux/pci.h>
>>>> #include <linux/pfn.h>
>>>> #include <linux/xarray.h>
>>>> #include <linux/virtio_anchor.h>
>>>> @@ -273,12 +274,28 @@ static const struct dma_map_ops xen_grant_dma_ops = {
>>>> .dma_supported = xen_grant_dma_supported,
>>>> };
>>>>
>>>> -static bool xen_is_dt_grant_dma_device(struct device *dev)
>>>> +static struct device_node *xen_dt_get_node(struct device *dev)
>>>> +{
>>>> + if (dev_is_pci(dev)) {
>>>> + struct pci_dev *pdev = to_pci_dev(dev);
>>>> + struct pci_bus *bus = pdev->bus;
>>>> +
>>>> + /* Walk up to the root bus to look for PCI Host controller */
>>>> + while (!pci_is_root_bus(bus))
>>>> + bus = bus->parent;
>>>> +
>>>> + return of_node_get(bus->bridge->parent->of_node);
>>>> + }
>>> Is it possible to have multiple virtio devices under a single virtio-pci
>>> root complex?
>> yes
>>
>>
>>> What if virtio-net has the backend in dom0 and
>>> virtio-block has the backend in dom1?
>>>
>>> Or each virtio PCI device shows up under a different PCI root complex?
>>
>> Good questions. To be honest, it is not 100% clear to me yet how it is
>> supposed to be. But let's guess...
>>
>> I think that having a PCI Host bridge per virtio-pci device is overkill.
>>
>> So, I see two options here:
>> 1. We provide PCI Host bridge per backends domain to the guest, so each
>> PCI Host bridge covers only virtio-pci devices whose backends are
>> running within *the same* domain.
>> With that we would be able to use property at PCI Host bridge level.
>>
>> 2. We provide only a single PCI Host bridge to the guest, so that single
>> PCI Host bridge covers all virtio-pci devices assigned to this guest.
>> No matter where the corresponding backends are running (the
>> virtio-devices under that PCI Host bridge can have the backends in
>> different domains).
>> With that we wouldn’t be able to use property at PCI Host bridge level.
>> And we need a more flexible option(s) to be able distinguish between
>> virtio-pci devices.
>>
>> Taking into account that for virtio-pci on Arm we need to emulate a
>> specific PCI Host bridge in Xen to intercept the guest PCI config space
>> accesses
>> (detect what PCI device is targeted) and forward them to the appropriate
>> backend (IOREQ Server),
>> it feels to me that we likely need to go with the second option here
>> (one PCI host bridge per guest), I may be mistaken,
>> but I don’t think that we want to emulate several PCI Host bridges for a
>> single guest (more code, more resources, etc).
>>
>>
>>
>>> If we can have multiple virtio PCI devices under a single PCI root
>>> complex, then I think it would be better to check for a per-device
>>> property, rather than a single property at the PCI root complex level.
>> Completely agree.
>>
>>
>>> The first thing that comes to mind is to describe each PCI device under
>>> the root complex in device tree. Although it is uncommon (usually only
>>> the PCI root complex is described in device tree), it is possible to
>>> also describe in device tree all the individual PCI devices under the
>>> root complex.
>>>
>>> Given that the domU device tree is generated by Xen and/or the Xen
>>> toolstack, it would be easy to arrange for it to happen.
>> Technically yes. If we decide to provide only a single PCI Host bridge
>> to the guest, we will have to deal with the virtio-pci devices with
>> various backend_domid,
>> so we can consider using more flexible property
>> “iommu-map”/”iommu-map-mask” specially introduced for such purposes:
>> https://urldefense.com/v3/__https://www.kernel.org/doc/Documentation/devicetree/bindings/pci/pci-iommu.txt__;!!GF_29dbcQIUBPA!17Dk_s8c_5QCQDmLM1FYp695LuUMSTrUV6HPfRA9BgmVz9TRRDZBOuzsdZw_c6G0ogky1P11gql9CHAOshukWmv7zWS0dQ$ [kernel[.]org]
>> I think, with that we would be able to describe, i.e that virtio-pci
>> device A connects to stream_id (backend_domid) X and virtio-pci device B
>> to stream_id Y,
>> and virtio-pci device C to nothing (so is not required to use grants),
>> unless I missed something.
>>
>> I have looked at it and I don’t see at the moment why the idea wouldn’t
>> work, but I haven’t experimented with that yet in such context.
> I think it would work too
I have experimented with that, it works. And I have already created a patch.
https://lore.kernel.org/xen-devel/[email protected]/
What I was thinking is although generic PCI-IOMMU bindings
("xen-grant-dma") wouldn't likely be suitable for *future* hotplug support,
it would allow us to have the working solution on Arm (with a minimal
changes, only drivers/xen/grant-dma-ops.c is touched)
at least for PCI devices which are known at the domain creation time. Of
course, this needs the proper support in the toolstack.
>
>
>>> That would solve the issue as far as I can tell, but I worry it might
>>> not be a good idea because if we rely on the per-device device tree node
>>> to be present then it becomes harder to implement virtio hotplug
>>> (Virtio hotplug is important to add dom0less support.)
>>>
>>> Let's say that we create a dom0less domU with an emulated PCI root
>>> complex without any devices under it, then after Dom0 is fully booted,
>>> we add a virtio-net emulated device. How do we tell the guest what is
>>> the backend domain id?
>>>
>>> Device tree and other firmware tables are not relevant anymore.
>>>
>>> We could reuse a PCI config space register to expose the backend id.
>>> However this solution requires a backend change (QEMU) to expose the
>>> backend id via an emulated register for each emulated device.
>>>
>>> To avoid having to introduce a special config space register in all
>>> emulated PCI devices (virtio-net, virtio-block, etc) I wonder if we
>>> could add a special PCI config space register at the emulated PCI Root
>>> Complex level.
>>>
>>> Basically the workflow would be as follow:
>>>
>>> - Linux recognizes the PCI Root Complex as a Xen PCI Root Complex
>>> - Linux writes to special PCI config space register of the Xen PCI Root
>>> Complex the PCI device id (basically the BDF)
>>> - The Xen PCI Root Complex emulated by Xen answers by writing back to
>>> the same location the backend id (domid of the backend)
>>> - Linux reads back the same PCI config space register of the Xen PCI
>>> Root Complex and learn the relevant domid
>>>
>>> What do you think?
>>
>> I think the idea sounds indeed interesting and would probably work, but
>> would require guest modifications other than just in drivers/xen (and
>> likely the specification changes as well).
>> Which ideally of course should be avoided.
>> Also I was thinking it would be nice not to diverge much between
>> communicating the backend_domid for platform and PCI devices on Arm with
>> device tree.
>>
>> If we managed to re-use generic IOMMU device-tree bindings for
>> virtio-mmio, we would likely be able to re-use PCI-IOMMU device-tree
>> bindings for virtio-pci,
>> at least for boot PCI devices (which are known at the domain creation time).
>> The more, the bindings is already present:
>> https://urldefense.com/v3/__https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/devicetree/bindings/iommu/xen,grant-dma.yaml__;!!GF_29dbcQIUBPA!17Dk_s8c_5QCQDmLM1FYp695LuUMSTrUV6HPfRA9BgmVz9TRRDZBOuzsdZw_c6G0ogky1P11gql9CHAOshukWmvpfF8GmA$ [git[.]kernel[.]org]
> I think using a special config space register in the root complex would
> not be terrible in terms of guest changes because it is easy to
> introduce a new root complex driver in Linux and other OSes. The root
> complex would still be ECAM compatible so the regular ECAM driver would
> still work. A new driver would only be necessary if you want to be able
> to access the special config space register.
This needs an additional investigation.
>
> That said, for sure the fewer changes to the guest the better and I like
> the existing xen,grant-dma binding.
>
>
>> Regarding the hotplug devices, yes it is a valid use-case which should
>> be also supported with virtio-pci, I assume the Xenstore could be
>> re-used for that purpose if it would be available.
>> The Xenstore is available with the usual Dom0 and toolstack, is it
>> available with dom0less?
> Xenstore is available for dom0less if we have a dom0 running with
> xenstored as one of the dom0less domains. We currently rely on it for
> Xen PV drivers with dom0less. After dom0 is fully booted, we use "xl
> network-attach" to create a vif interface dynamically in the domU.
>
> That is why I was thinking of using virtio hotplug to solve the same
> problem with virtio, I was imagining that after dom0 is fully booted we
> would do "xl virtio-attach" and create a new virtio interface in the
> domU. But I cannot see an easy way to make virtio hotplug work together
> with the xen,grant-dma bindings. I think it would be better if we find a
> way to make it work without xenstore (because xenstore would be a
> safety-certification dependency).
I got your concern regarding the usage of xenstore in general.
Also I got that hotplug is the only way to get virtual devices (either
PV or virtio) working for the dom0less system, is my understanding correct?
The virtio hotplug could *probably* work together with the
"xen,grant-dma" bindings with some prerequisites (backend domid should
be known in advance),
but I am not quite sure as I am not too familiar with dom0less details. But
anyway, I will try to describe it...
As far as I know the dom0less DomUs are configured from the device-tree.
So the configuration is known and prepared beforehand.
I may guess that we know in what DomU1 we are going to run the virtio
backends and what PCI devices we are going to assign to DomU2. So when
Xen generating device-tree
for DomU2 it could create proper iommu-map for the PCI Host bridge node.
Although for this to work we would need an ability to configure domain
ID (d->domain_id) via device-tree
and likely reserve some range of domain IDs (to not cross with
max_init_domid). But, it wouldn't be 100% hotplug then.
>
> Maybe we need to think outside the box and find another solution that
> doesn't rely on hotplug.
>
> For instance, let's say that we expose the virtio devices in device tree
> in a dom0less configuration too but with status = "disabled". When dom0
> (or backend domain) is up and running it can signal that it is ready.
> Maybe if we had a special Xen-specific PCI Root Complex driver in the
> guest, it could wait for the Xen signal and then continue PCI probing at
> that point honoring xen,grant-dma bindings if present in device tree
> even if the devices had status = "disabled" initially.
>
> It looks like that would require many guest changes unfortunately.
It looks like yes, also you mentioned "it can signal that it is ready",
the question is by what means (xenstore would be a good fit here...)?
And I haven't seen that virtio-pci devices are described in device-tree
somewhere, only generic PCI host bridge node
is described. The virtio-pci devices will be detected the same way as
usual PCI devices during boot. Unless I missed something.
Regarding the virtio-mmio (platform) devices, yes, we could expose them
with status "disabled", and they won't get probed by default.
To be honest, I have experimented with that, when I was thinking of
possible hotplug for virtio-mmio devices (I know, this sounds uncommon
and strange).
I used Linux feature (CONFIG_OF_DYNAMIC, overlays) to update the
device-tree on running guest, so the toolstack initially inserts
virtio-mmio device nodes for non-boot devices
with status "disabled", and at the runtime, once we receive an event for
example, we change the status to "ok" and the corresponding virtio-mmio
device gets probed.
But again, it is not a 100% hotplug, as we need to pre-allocate memory
range and interrupt in advance (when generating guest device tree).
>
>
> As an alternative I wonder, given that Xen emulates the PCI root
> complex, if we can reuse one of the PCI link up/down delays for this
> instead, like "pcie_wait_for_link". It looks like the wait time is in
> millisec while we would need potentially several seconds here but it
> might be possible?
I am not sure that I understand this alternative idea.
>
> Other ideas?
Another (crazy?) idea is to reuse CONFIG_XEN_VIRTIO_FORCE_GRANT for
dom0less system (I mean without "xen,grant-dma" bindings at all).
If virtio backends are always going to run in Dom0 when we have it up
and running, then it should work as domid == 0 is reserved for Dom0.
If there is a need to run virtio backends in other *backend* domain (for
the domain ID to be always known we could reserve an ID for it, so it
would be a const value),
we could probably introduce something configurable like
CONFIG_XEN_VIRTIO_FORCE_GRANT_BE_DOMID with 0 by default (or cmd line
option).
--
Regards,
Oleksandr Tyshchenko
On Sat, 15 Oct 2022, Oleksandr Tyshchenko wrote:
> On 13.10.22 03:33, Stefano Stabellini wrote:
>
> Hello Stefano
>
> > On Wed, 12 Oct 2022, Oleksandr Tyshchenko wrote:
> >>> On Thu, 6 Oct 2022, Oleksandr Tyshchenko wrote:
> >>>> From: Oleksandr Tyshchenko <[email protected]>
> >>>>
> >>>> Use the same "xen-grant-dma" device concept (based on generic IOMMU
> >>>> device-tree bindings) for the PCI devices behind device-tree based
> >>>> PCI Host controller.
> >>>>
> >>>> Signed-off-by: Oleksandr Tyshchenko <[email protected]>
> >>>> ---
> >>>> Slightly RFC. This is needed to support Xen grant mappings for virtio-pci devices
> >>>> on Arm at some point in the future. The Xen toolstack side is not published yet.
> >>>> Here, for PCI devices we use the same way to pass backend domid to the guest as for
> >>>> platform devices.
> >>>>
> >>>> Depends on Juergen's series:
> >>>> https://urldefense.com/v3/__https://lore.kernel.org/xen-devel/[email protected]/__;!!GF_29dbcQIUBPA!waOk2Goc7qlhNo5-csRObryil_GzMF_e61EJR501oJ08cH2dnJulsZXWlelBDTBqa63TVoUcWQTB5NecJ1p4xFNgh2_EuA$ [lore[.]kernel[.]org]
> >>>> ---
> >>>> drivers/xen/grant-dma-ops.c | 51 +++++++++++++++++++++++++++++--------
> >>>> 1 file changed, 41 insertions(+), 10 deletions(-)
> >>>>
> >>>> diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c
> >>>> index ff9be3aff87e..79d13122ec08 100644
> >>>> --- a/drivers/xen/grant-dma-ops.c
> >>>> +++ b/drivers/xen/grant-dma-ops.c
> >>>> @@ -10,6 +10,7 @@
> >>>> #include <linux/module.h>
> >>>> #include <linux/dma-map-ops.h>
> >>>> #include <linux/of.h>
> >>>> +#include <linux/pci.h>
> >>>> #include <linux/pfn.h>
> >>>> #include <linux/xarray.h>
> >>>> #include <linux/virtio_anchor.h>
> >>>> @@ -273,12 +274,28 @@ static const struct dma_map_ops xen_grant_dma_ops = {
> >>>> .dma_supported = xen_grant_dma_supported,
> >>>> };
> >>>>
> >>>> -static bool xen_is_dt_grant_dma_device(struct device *dev)
> >>>> +static struct device_node *xen_dt_get_node(struct device *dev)
> >>>> +{
> >>>> + if (dev_is_pci(dev)) {
> >>>> + struct pci_dev *pdev = to_pci_dev(dev);
> >>>> + struct pci_bus *bus = pdev->bus;
> >>>> +
> >>>> + /* Walk up to the root bus to look for PCI Host controller */
> >>>> + while (!pci_is_root_bus(bus))
> >>>> + bus = bus->parent;
> >>>> +
> >>>> + return of_node_get(bus->bridge->parent->of_node);
> >>>> + }
> >>> Is it possible to have multiple virtio devices under a single virtio-pci
> >>> root complex?
> >> yes
> >>
> >>
> >>> What if virtio-net has the backend in dom0 and
> >>> virtio-block has the backend in dom1?
> >>>
> >>> Or each virtio PCI device shows up under a different PCI root complex?
> >>
> >> Good questions. To be honest, it is not 100% clear to me yet how it is
> >> supposed to be. But let's guess...
> >>
> >> I think that having a PCI Host bridge per virtio-pci device is overkill.
> >>
> >> So, I see two options here:
> >> 1. We provide PCI Host bridge per backends domain to the guest, so each
> >> PCI Host bridge covers only virtio-pci devices whose backends are
> >> running within *the same* domain.
> >> With that we would be able to use property at PCI Host bridge level.
> >>
> >> 2. We provide only a single PCI Host bridge to the guest, so that single
> >> PCI Host bridge covers all virtio-pci devices assigned to this guest.
> >> No matter where the corresponding backends are running (the
> >> virtio-devices under that PCI Host bridge can have the backends in
> >> different domains).
> >> With that we wouldn’t be able to use property at PCI Host bridge level.
> >> And we need a more flexible option(s) to be able distinguish between
> >> virtio-pci devices.
> >>
> >> Taking into account that for virtio-pci on Arm we need to emulate a
> >> specific PCI Host bridge in Xen to intercept the guest PCI config space
> >> accesses
> >> (detect what PCI device is targeted) and forward them to the appropriate
> >> backend (IOREQ Server),
> >> it feels to me that we likely need to go with the second option here
> >> (one PCI host bridge per guest), I may be mistaken,
> >> but I don’t think that we want to emulate several PCI Host bridges for a
> >> single guest (more code, more resources, etc).
> >>
> >>
> >>
> >>> If we can have multiple virtio PCI devices under a single PCI root
> >>> complex, then I think it would be better to check for a per-device
> >>> property, rather than a single property at the PCI root complex level.
> >> Completely agree.
> >>
> >>
> >>> The first thing that comes to mind is to describe each PCI device under
> >>> the root complex in device tree. Although it is uncommon (usually only
> >>> the PCI root complex is described in device tree), it is possible to
> >>> also describe in device tree all the individual PCI devices under the
> >>> root complex.
> >>>
> >>> Given that the domU device tree is generated by Xen and/or the Xen
> >>> toolstack, it would be easy to arrange for it to happen.
> >> Technically yes. If we decide to provide only a single PCI Host bridge
> >> to the guest, we will have to deal with the virtio-pci devices with
> >> various backend_domid,
> >> so we can consider using more flexible property
> >> “iommu-map”/”iommu-map-mask” specially introduced for such purposes:
> >> https://urldefense.com/v3/__https://www.kernel.org/doc/Documentation/devicetree/bindings/pci/pci-iommu.txt__;!!GF_29dbcQIUBPA!17Dk_s8c_5QCQDmLM1FYp695LuUMSTrUV6HPfRA9BgmVz9TRRDZBOuzsdZw_c6G0ogky1P11gql9CHAOshukWmv7zWS0dQ$ [kernel[.]org]
> >> I think, with that we would be able to describe, i.e that virtio-pci
> >> device A connects to stream_id (backend_domid) X and virtio-pci device B
> >> to stream_id Y,
> >> and virtio-pci device C to nothing (so is not required to use grants),
> >> unless I missed something.
> >>
> >> I have looked at it and I don’t see at the moment why the idea wouldn’t
> >> work, but I haven’t experimented with that yet in such context.
> > I think it would work too
>
>
> I have experimented with that, it works. And I have already created a patch.
>
> https://lore.kernel.org/xen-devel/[email protected]/
>
>
> What I was thinking is although generic PCI-IOMMU bindings
> ("xen-grant-dma") wouldn't likely be suitable for *future* hotplug support,
> it would allow us to have the working solution on Arm (with a minimal
> changes, only drivers/xen/grant-dma-ops.c is touched)
> at least for PCI devices which are known at the domain creation time. Of
> course, this needs the proper support in the toolstack.
Yeah, it is hard to argue against this, as we don't have a good
alternative :-)
> >>> That would solve the issue as far as I can tell, but I worry it might
> >>> not be a good idea because if we rely on the per-device device tree node
> >>> to be present then it becomes harder to implement virtio hotplug
> >>> (Virtio hotplug is important to add dom0less support.)
> >>>
> >>> Let's say that we create a dom0less domU with an emulated PCI root
> >>> complex without any devices under it, then after Dom0 is fully booted,
> >>> we add a virtio-net emulated device. How do we tell the guest what is
> >>> the backend domain id?
> >>>
> >>> Device tree and other firmware tables are not relevant anymore.
> >>>
> >>> We could reuse a PCI config space register to expose the backend id.
> >>> However this solution requires a backend change (QEMU) to expose the
> >>> backend id via an emulated register for each emulated device.
> >>>
> >>> To avoid having to introduce a special config space register in all
> >>> emulated PCI devices (virtio-net, virtio-block, etc) I wonder if we
> >>> could add a special PCI config space register at the emulated PCI Root
> >>> Complex level.
> >>>
> >>> Basically the workflow would be as follow:
> >>>
> >>> - Linux recognizes the PCI Root Complex as a Xen PCI Root Complex
> >>> - Linux writes to special PCI config space register of the Xen PCI Root
> >>> Complex the PCI device id (basically the BDF)
> >>> - The Xen PCI Root Complex emulated by Xen answers by writing back to
> >>> the same location the backend id (domid of the backend)
> >>> - Linux reads back the same PCI config space register of the Xen PCI
> >>> Root Complex and learn the relevant domid
> >>>
> >>> What do you think?
> >>
> >> I think the idea sounds indeed interesting and would probably work, but
> >> would require guest modifications other than just in drivers/xen (and
> >> likely the specification changes as well).
> >> Which ideally of course should be avoided.
> >> Also I was thinking it would be nice not to diverge much between
> >> communicating the backend_domid for platform and PCI devices on Arm with
> >> device tree.
> >>
> >> If we managed to re-use generic IOMMU device-tree bindings for
> >> virtio-mmio, we would likely be able to re-use PCI-IOMMU device-tree
> >> bindings for virtio-pci,
> >> at least for boot PCI devices (which are known at the domain creation time).
> >> The more, the bindings is already present:
> >> https://urldefense.com/v3/__https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/devicetree/bindings/iommu/xen,grant-dma.yaml__;!!GF_29dbcQIUBPA!17Dk_s8c_5QCQDmLM1FYp695LuUMSTrUV6HPfRA9BgmVz9TRRDZBOuzsdZw_c6G0ogky1P11gql9CHAOshukWmvpfF8GmA$ [git[.]kernel[.]org]
> > I think using a special config space register in the root complex would
> > not be terrible in terms of guest changes because it is easy to
> > introduce a new root complex driver in Linux and other OSes. The root
> > complex would still be ECAM compatible so the regular ECAM driver would
> > still work. A new driver would only be necessary if you want to be able
> > to access the special config space register.
>
>
> This needs an additional investigation.
>
>
> >
> > That said, for sure the fewer changes to the guest the better and I like
> > the existing xen,grant-dma binding.
> >
> >
> >> Regarding the hotplug devices, yes it is a valid use-case which should
> >> be also supported with virtio-pci, I assume the Xenstore could be
> >> re-used for that purpose if it would be available.
> >> The Xenstore is available with the usual Dom0 and toolstack, is it
> >> available with dom0less?
> > Xenstore is available for dom0less if we have a dom0 running with
> > xenstored as one of the dom0less domains. We currently rely on it for
> > Xen PV drivers with dom0less. After dom0 is fully booted, we use "xl
> > network-attach" to create a vif interface dynamically in the domU.
> >
> > That is why I was thinking of using virtio hotplug to solve the same
> > problem with virtio, I was imagining that after dom0 is fully booted we
> > would do "xl virtio-attach" and create a new virtio interface in the
> > domU. But I cannot see an easy way to make virtio hotplug work together
> > with the xen,grant-dma bindings. I think it would be better if we find a
> > way to make it work without xenstore (because xenstore would be a
> > safety-certification dependency).
>
> I got your concern regarding the usage of xenstore in general.
> Also I got that hotplug is the only way to get virtual devices (either
> PV or virtio) working for the dom0less system, is my understanding correct?
In a dom0less system domains boot in parallel. The backend is typically
in a larger and slower domain to boot (Linux). So to solve the problem
with Xen PV drivers, we hotplug Xen PV devices after dom0 is booted ("xl
network-attach" for instance).
With virtio, we could either do the same (use virtio hotplug) or find a
way to tell the frontend to delay initialization. The point is that we
don't want the frontend to try to access backend resources before the
backend is up and running.
> The virtio hotplug could *probably* work together with the
> "xen,grant-dma" bindings with some prerequisites (backend domid should
> be known in advance),
> but I am not quite sure as I am not too familiar with dom0less details. But
> anyway, I will try to describe it...
>
> As far as I know the dom0less DomUs are configured from the device-tree.
> So the configuration is known and prepared beforehand.
> I may guess that we know in what DomU1 we are going to run the virtio
> backends and what PCI devices we are going to assign to DomU2. So when
> Xen generating device-tree
> for DomU2 it could create proper iommu-map for the PCI Host bridge node.
> Although for this to work we would need an ability to configure domain
> ID (d->domain_id) via device-tree
> and likely reserve some range of domain IDs (to not cross with
> max_init_domid). But, it wouldn't be 100% hotplug then.
That's fine it doesn't have to be 100% hotplug. In reality, this is a
static configuration so we know all the information beforehand (which VM
is the backend, which is the frontend, which devices are
shared/emulated). The only issue is that we need a way to tell VM2 to
wait for the backend in VM1 to come online. But the device tree could
contain all information from the start.
> > Maybe we need to think outside the box and find another solution that
> > doesn't rely on hotplug.
> >
> > For instance, let's say that we expose the virtio devices in device tree
> > in a dom0less configuration too but with status = "disabled". When dom0
> > (or backend domain) is up and running it can signal that it is ready.
> > Maybe if we had a special Xen-specific PCI Root Complex driver in the
> > guest, it could wait for the Xen signal and then continue PCI probing at
> > that point honoring xen,grant-dma bindings if present in device tree
> > even if the devices had status = "disabled" initially.
> >
> > It looks like that would require many guest changes unfortunately.
>
>
> It looks like yes, also you mentioned "it can signal that it is ready",
> the question is by what means (xenstore would be a good fit here...)?
Maybe xenstore, yes. The problem is that we want something that works
with minimal drivers changes, and the problem is that if we present the
virtio devices in device tree from boot, the drivers will try to probe
them immediately. We need a way to delay that.
> And I haven't seen that virtio-pci devices are described in device-tree
> somewhere, only generic PCI host bridge node
> is described. The virtio-pci devices will be detected the same way as
> usual PCI devices during boot. Unless I missed something.
Yes exactly, and that is the problem. How do we make those drivers "wait"
before probing.
> Regarding the virtio-mmio (platform) devices, yes, we could expose them
> with status "disabled", and they won't get probed by default.
> To be honest, I have experimented with that, when I was thinking of
> possible hotplug for virtio-mmio devices (I know, this sounds uncommon
> and strange).
> I used Linux feature (CONFIG_OF_DYNAMIC, overlays) to update the
> device-tree on running guest, so the toolstack initially inserts
> virtio-mmio device nodes for non-boot devices
> with status "disabled", and at the runtime, once we receive an event for
> example, we change the status to "ok" and the corresponding virtio-mmio
> device gets probed.
> But again, it is not a 100% hotplug, as we need to pre-allocate memory
> range and interrupt in advance (when generating guest device tree).
Actually this is really cool! Does it work? It doesn't matter to me if
the virtio devices are pci or mmio as long as we can solve the "wait"
problem. So this could be a good solution.
> > As an alternative I wonder, given that Xen emulates the PCI root
> > complex, if we can reuse one of the PCI link up/down delays for this
> > instead, like "pcie_wait_for_link". It looks like the wait time is in
> > millisec while we would need potentially several seconds here but it
> > might be possible?
>
> I am not sure that I understand this alternative idea.
The PCI subsystem has already a concept of wait times. Just have a look
at pcie_wait_for_link under drivers/pci. The question was whether we can
find a way to reuse one of the existing wait times to deal with our
"wait" problem.
> >
> > Other ideas?
>
> Another (crazy?) idea is to reuse CONFIG_XEN_VIRTIO_FORCE_GRANT for
> dom0less system (I mean without "xen,grant-dma" bindings at all).
> If virtio backends are always going to run in Dom0 when we have it up
> and running, then it should work as domid == 0 is reserved for Dom0.
> If there is a need to run virtio backends in other *backend* domain (for
> the domain ID to be always known we could reserve an ID for it, so it
> would be a const value),
> we could probably introduce something configurable like
> CONFIG_XEN_VIRTIO_FORCE_GRANT_BE_DOMID with 0 by default (or cmd line
> option).
The problem in a dom0less system is not so much how to tell which is the
backend domid, because that is known in advance and could be added to
device tree at boot somehow. The issue is how to ask the frontend to
"wait" and then how to tell the frontend to "proceed" after the backend
comes online.
On 17.10.22 23:39, Stefano Stabellini wrote:
Hello Stefano
> On Sat, 15 Oct 2022, Oleksandr Tyshchenko wrote:
>> On 13.10.22 03:33, Stefano Stabellini wrote:
>>
>> Hello Stefano
>>
>>> On Wed, 12 Oct 2022, Oleksandr Tyshchenko wrote:
>>>>> On Thu, 6 Oct 2022, Oleksandr Tyshchenko wrote:
>>>>>> From: Oleksandr Tyshchenko <[email protected]>
>>>>>>
>>>>>> Use the same "xen-grant-dma" device concept (based on generic IOMMU
>>>>>> device-tree bindings) for the PCI devices behind device-tree based
>>>>>> PCI Host controller.
>>>>>>
>>>>>> Signed-off-by: Oleksandr Tyshchenko <[email protected]>
>>>>>> ---
>>>>>> Slightly RFC. This is needed to support Xen grant mappings for virtio-pci devices
>>>>>> on Arm at some point in the future. The Xen toolstack side is not published yet.
>>>>>> Here, for PCI devices we use the same way to pass backend domid to the guest as for
>>>>>> platform devices.
>>>>>>
>>>>>> Depends on Juergen's series:
>>>>>> https://urldefense.com/v3/__https://lore.kernel.org/xen-devel/[email protected]/__;!!GF_29dbcQIUBPA!waOk2Goc7qlhNo5-csRObryil_GzMF_e61EJR501oJ08cH2dnJulsZXWlelBDTBqa63TVoUcWQTB5NecJ1p4xFNgh2_EuA$ [lore[.]kernel[.]org]
>>>>>> ---
>>>>>> drivers/xen/grant-dma-ops.c | 51 +++++++++++++++++++++++++++++--------
>>>>>> 1 file changed, 41 insertions(+), 10 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c
>>>>>> index ff9be3aff87e..79d13122ec08 100644
>>>>>> --- a/drivers/xen/grant-dma-ops.c
>>>>>> +++ b/drivers/xen/grant-dma-ops.c
>>>>>> @@ -10,6 +10,7 @@
>>>>>> #include <linux/module.h>
>>>>>> #include <linux/dma-map-ops.h>
>>>>>> #include <linux/of.h>
>>>>>> +#include <linux/pci.h>
>>>>>> #include <linux/pfn.h>
>>>>>> #include <linux/xarray.h>
>>>>>> #include <linux/virtio_anchor.h>
>>>>>> @@ -273,12 +274,28 @@ static const struct dma_map_ops xen_grant_dma_ops = {
>>>>>> .dma_supported = xen_grant_dma_supported,
>>>>>> };
>>>>>>
>>>>>> -static bool xen_is_dt_grant_dma_device(struct device *dev)
>>>>>> +static struct device_node *xen_dt_get_node(struct device *dev)
>>>>>> +{
>>>>>> + if (dev_is_pci(dev)) {
>>>>>> + struct pci_dev *pdev = to_pci_dev(dev);
>>>>>> + struct pci_bus *bus = pdev->bus;
>>>>>> +
>>>>>> + /* Walk up to the root bus to look for PCI Host controller */
>>>>>> + while (!pci_is_root_bus(bus))
>>>>>> + bus = bus->parent;
>>>>>> +
>>>>>> + return of_node_get(bus->bridge->parent->of_node);
>>>>>> + }
>>>>> Is it possible to have multiple virtio devices under a single virtio-pci
>>>>> root complex?
>>>> yes
>>>>
>>>>
>>>>> What if virtio-net has the backend in dom0 and
>>>>> virtio-block has the backend in dom1?
>>>>>
>>>>> Or each virtio PCI device shows up under a different PCI root complex?
>>>> Good questions. To be honest, it is not 100% clear to me yet how it is
>>>> supposed to be. But let's guess...
>>>>
>>>> I think that having a PCI Host bridge per virtio-pci device is overkill.
>>>>
>>>> So, I see two options here:
>>>> 1. We provide PCI Host bridge per backends domain to the guest, so each
>>>> PCI Host bridge covers only virtio-pci devices whose backends are
>>>> running within *the same* domain.
>>>> With that we would be able to use property at PCI Host bridge level.
>>>>
>>>> 2. We provide only a single PCI Host bridge to the guest, so that single
>>>> PCI Host bridge covers all virtio-pci devices assigned to this guest.
>>>> No matter where the corresponding backends are running (the
>>>> virtio-devices under that PCI Host bridge can have the backends in
>>>> different domains).
>>>> With that we wouldn’t be able to use property at PCI Host bridge level.
>>>> And we need a more flexible option(s) to be able to distinguish between
>>>> virtio-pci devices.
>>>>
>>>> Taking into account that for virtio-pci on Arm we need to emulate a
>>>> specific PCI Host bridge in Xen to intercept the guest PCI config space
>>>> accesses
>>>> (detect what PCI device is targeted) and forward them to the appropriate
>>>> backend (IOREQ Server),
>>>> it feels to me that we likely need to go with the second option here
>>>> (one PCI host bridge per guest), I may mistake,
>>>> but I don’t think that we want to emulate several PCI Host bridges for a
>>>> single guest (more code, more resources, etc).
>>>>
>>>>
>>>>
>>>>> If we can have multiple virtio PCI devices under a single PCI root
>>>>> complex, then I think it would be better to check for a per-device
>>>>> property, rather than a single property at the PCI root complex level.
>>>> Completely agree.
>>>>
>>>>
>>>>> The first thing that comes to mind is to describe each PCI device under
>>>>> the root complex in device tree. Although it is uncommon (usually only
>>>>> the PCI root complex is described in device tree), it is possible to
>>>>> also describe in device tree all the individual PCI devices under the
>>>>> root complex.
>>>>>
>>>>> Given that the domU device tree is generated by Xen and/or the Xen
>>>>> toolstack, it would be easy to arrange for it to happen.
>>>> Technically yes. If we decide to provide only a single PCI Host bridge
>>>> to the guest, we will have to deal with the virtio-pci devices with
>>>> various backend_domid,
>>>> so we can consider using more flexible property
>>>> “iommu-map”/”iommu-map-mask” specially introduced for such purposes:
>>>> https://urldefense.com/v3/__https://www.kernel.org/doc/Documentation/devicetree/bindings/pci/pci-iommu.txt__;!!GF_29dbcQIUBPA!17Dk_s8c_5QCQDmLM1FYp695LuUMSTrUV6HPfRA9BgmVz9TRRDZBOuzsdZw_c6G0ogky1P11gql9CHAOshukWmv7zWS0dQ$ [kernel[.]org]
>>>> I think, with that we would be able to describe, i.e that virtio-pci
>>>> device A connects to stream_id (backend_domid) X and virtio-pci device B
>>>> to stream_id Y,
>>>> and virtio-pci device C to nothing (so is not required to use grants),
>>>> unless I missed something.
>>>>
>>>> I have looked at it and I don’t see at the moment why the idea wouldn’t
>>>> work, but I haven’t experimented with that yet in such context.
>>> I think it would work too
>>
>> I have experimented with that, it works. And I have already created a patch.
>>
>> https://urldefense.com/v3/__https://lore.kernel.org/xen-devel/[email protected]/__;!!GF_29dbcQIUBPA!xBJlE6fmbV_XjVwm0QBIL4YTCJOoHu337mPEYWO96QBymp8uNoQpaxWYgq3oRzfVVA0IO9Kz77hebIkDsPYqGHurtMrVMA$ [lore[.]kernel[.]org]
>>
>>
>> What I was thinking is although generic PCI-IOMMU bindings
>> ("xen-grant-dma") wouldn't likely be suitable for *future* hotplug support,
>> it would allow us to have the working solution on Arm (with a minimal
>> changes, only drivers/xen/grant-dma-ops.c is touched)
>> at least for PCI devices which are known at the domain creation time. Of
>> course, this needs the proper support in the toolstack.
> Yeah, it is hard to argue against this, as we don't have a good
> alternative :-)
>
>
>>>>> That would solve the issue as far as I can tell, but I worry it might
>>>>> not be a good idea because if we rely on the per-device device tree node
>>>>> to be present then it becomes harder to implement virtio hotplug
>>>>> (Virtio hotplug is important to add dom0less support.)
>>>>>
>>>>> Let's say that we create a dom0less domU with an emulated PCI root
>>>>> complex without any devices under it, then after Dom0 is fully booted,
>>>>> we add a virtio-net emulated device. How do we tell the guest what is
>>>>> the backend domain id?
>>>>>
>>>>> Device tree and other firmware tables are not relevant anymore.
>>>>>
>>>>> We could reuse a PCI config space register to expose the backend id.
>>>>> However this solution requires a backend change (QEMU) to expose the
>>>>> backend id via an emulated register for each emulated device.
>>>>>
>>>>> To avoid having to introduce a special config space register in all
>>>>> emulated PCI devices (virtio-net, virtio-block, etc) I wonder if we
>>>>> could add a special PCI config space register at the emulated PCI Root
>>>>> Complex level.
>>>>>
>>>>> Basically the workflow would be as follow:
>>>>>
>>>>> - Linux recognizes the PCI Root Complex as a Xen PCI Root Complex
>>>>> - Linux writes to special PCI config space register of the Xen PCI Root
>>>>> Complex the PCI device id (basically the BDF)
>>>>> - The Xen PCI Root Complex emulated by Xen answers by writing back to
>>>>> the same location the backend id (domid of the backend)
>>>>> - Linux reads back the same PCI config space register of the Xen PCI
>>>>> Root Complex and learn the relevant domid
>>>>>
>>>>> What do you think?
>>>> I think the idea sounds indeed interesting and would probably work, but
>>>> would require guest modifications other than just in drivers/xen (and
>>>> likely the specification changes as well).
>>>> Which ideally of course should be avoided.
>>>> Also I was thinking it would be nice not to diverge much between
>>>> communicating the backend_domid for platform and PCI devices on Arm with
>>>> device tree.
>>>>
>>>> If we managed to re-use generic IOMMU device-tree bindings for
>>>> virtio-mmio, we would likely be able to re-use PCI-IOMMU device-tree
>>>> bindings for virtio-pci,
>>>> at least for boot PCI devices (which are known at the domain creation time).
>>>> The more, the bindings is already present:
>>>> https://urldefense.com/v3/__https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/devicetree/bindings/iommu/xen,grant-dma.yaml__;!!GF_29dbcQIUBPA!17Dk_s8c_5QCQDmLM1FYp695LuUMSTrUV6HPfRA9BgmVz9TRRDZBOuzsdZw_c6G0ogky1P11gql9CHAOshukWmvpfF8GmA$ [git[.]kernel[.]org]
>>> I think using a special config space register in the root complex would
>>> not be terrible in terms of guest changes because it is easy to
>>> introduce a new root complex driver in Linux and other OSes. The root
>>> complex would still be ECAM compatible so the regular ECAM driver would
>>> still work. A new driver would only be necessary if you want to be able
>>> to access the special config space register.
>>
>> This needs an additional investigation.
>>
>>
>>> That said, for sure the fewer changes to the guest the better and I like
>>> the existing xen,grant-dma binding.
>>>
>>>
>>>> Regarding the hotplug devices, yes it is a valid use-case which should
>>>> be also supported with virtio-pci, I assume the Xenstore could be
>>>> re-used for that purpose if it would be available.
>>>> The Xenstore is available with the usual Dom0 and toolstack, is it
>>>> available with dom0less?
>>> Xenstore is available for dom0less if we have a dom0 running with
>>> xenstored as one of the dom0less domains. We currently rely on it for
>>> Xen PV drivers with dom0less. After dom0 is fully booted, we use "xl
>>> network-attach" to create a vif interface dynamically in the domU.
>>>
>>> That is why I was thinking of using virtio hotplug to solve the same
>>> problem with virtio, I was imagining that after dom0 is fully booted we
>>> would do "xl virtio-attach" and create a new virtio interface in the
>>> domU. But I cannot see an easy way to make virtio hotplug work together
>>> with the xen,grant-dma bindings. I think it would be better if we find a
>>> way to make it work without xenstore (because xenstore would be a
>>> safety-certification dependency).
>> I got your concern regarding the usage of xenstore in general.
>> Also I got that hotplug is the only way to get virtual devices (either
>> PV or virtio) working for the dom0less system, is my understanding correct?
> In a dom0less system domains boot in parallel. The backend is typically
> in a larger and slower domain to boot (Linux). So to solve the problem
> with Xen PV drivers, we hotplug Xen PV devices after dom0 is booted ("xl
> network-attach" for instance).
>
> With virtio, we could either do the same (use virtio hotplug) or find a
> way to tell the frontend to delay initialization. The point is that we
> don't want the frontend to try to access backend resources before the
> backend is up and running.
Now I got the problem. Thank you for the explanation.
>
>
>> The virtio hotplug could *probably* work together with the
>> "xen,grant-dma" bindings with some prerequisites (backend domid should
>> be known in advance),
>> but I am not quite sure as I am not too familiar with dom0less details. But
>> anyway, I will try to describe it...
>>
>> As far as I know the dom0less DomUs are configured from the device-tree.
>> So the configuration is known and prepared beforehand.
>> I may guess that we know in what DomU1 we are going to run the virtio
>> backends and what PCI devices we are going to assign to DomU2. So when
>> Xen generates the device-tree
>> for DomU2 it could create proper iommu-map for the PCI Host bridge node.
>> Although for this to work we would need an ability to configure domain
>> ID (d->domain_id) via device-tree
>> and likely reserve some range of domain IDs (to not cross with
>> max_init_domid). But, it wouldn't be 100% hotplug then.
> That's fine it doesn't have to be 100% hotplug. In reality, this is a
> static configuration so we know all the information beforehand (which VM
> is the backend, which is the frontend, which devices are
> shared/emulated). The only issue is that we need a way to tell VM2 to
> wait for the backend in VM1 to come online. But the device tree could
> contain all information from the start.
great, it is good that device tree could contain all information from
the start,
I think this will simplify things.
>
>
>>> Maybe we need to think outside the box and find another solution that
>>> doesn't rely on hotplug.
>>>
>>> For instance, let's say that we expose the virtio devices in device tree
>>> in a dom0less configuration too but with status = "disabled". When dom0
>>> (or backend domain) is up and running it can signal that it is ready.
>>> Maybe if we had a special Xen-specific PCI Root Complex driver in the
>>> guest, it could wait for the Xen signal and then continue PCI probing at
>>> that point honoring xen,grant-dma bindings if present in device tree
>>> even if the devices had status = "disabled" initially.
>>>
>>> It looks like that would require many guest changes unfortunately.
>>
>> It looks like yes, also you mentioned "it can signal that it is ready",
>> the question is by what means (xenstore would be a good fit here...)?
> Maybe xenstore, yes. The problem is that we want something that works
> with minimal drivers changes, and the problem is that if we present the
> virtio devices in device tree from boot, the drivers will try to probe
> them immediately. We need a way to delay that.
agree
>
>
>> And I haven't seen that virtio-pci devices are described in device-tree
>> somewhere, only generic PCI host bridge node
>> is described. The virtio-pci devices will be detected the same way as
>> usual PCI devices during boot. Unless I missed something.
> Yes exactly, and that is the problem. How do we make those drivers "wait"
> before probing.
ok, so ...
>
>
>> Regarding the virtio-mmio (platform) devices, yes, we could expose them
>> with status "disabled", and they won't get probed by default.
>> To be honest, I have experimented with that, when I was thinking of
>> possible hotplug for virtio-mmio devices (I know, this sounds uncommon
>> and strange).
>> I used Linux feature (CONFIG_OF_DYNAMIC, overlays) to update the
>> device-tree on running guest, so the toolstack initially inserts
>> virtio-mmio device nodes for non-boot devices
>> with status "disabled", and at the runtime, once we receive an event for
>> example, we change the status to "ok" and the corresponding virtio-mmio
>> device gets probed.
>> But again, it is not a 100% hotplug, as we need to pre-allocate memory
>> range and interrupt in advance (when generating guest device tree).
> Actually this is really cool! Does it work? It doesn't matter to me if
> the virtio devices are pci or mmio as long as we can solve the "wait"
> problem. So this could be a good solution.
... yes, it does. Initially I experimented with virtio-mmio devices, but
today I tried with PCI host bridge as well.
I won't describe the commands which I used to apply/remove device-tree
overlays from the userspace as well as the content of the
dtso files I created, I will describe how that could be done from the
kernel by using existing functionality (CONFIG_OF_DYNAMIC).
As I said, if we exposed the devices with status "disabled", they
wouldn't get probed by default. Once we receive a signal
that otherend is ready, we change the status to "ok" and the
corresponding device gets probed.
So below is the test patch, which just changes the status of the required
device-tree node (as you can see the code to update the property is
simple enough),
I hacked "xl sysrq" for the convenience of testing.
diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c
index 045c1805b2d5..9683ce075bc9 100644
--- a/drivers/xen/grant-dma-ops.c
+++ b/drivers/xen/grant-dma-ops.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/dma-map-ops.h>
#include <linux/of.h>
+#include <linux/of_platform.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/xarray.h>
@@ -379,6 +380,108 @@ bool xen_is_grant_dma_device(struct device *dev)
return false;
}
+/* TODO: Consider using statically allocated (struct property status) */
+static int xen_grant_dma_enable_device(struct device_node *np)
+{
+ struct property *status;
+
+ status = kzalloc(sizeof(*status), GFP_KERNEL);
+ if (!status)
+ return -ENOMEM;
+
+ status->name = kstrdup("status", GFP_KERNEL);
+ if (!status->name)
+ return -ENOMEM;
+
+ status->value = kstrdup("okay", GFP_KERNEL);
+ if (!status->value)
+ return -ENOMEM;
+
+ status->length = sizeof("okay");
+
+ return of_update_property(np, status);
+}
+
+static int xen_grant_dma_disable_device(struct device_node *np)
+{
+ struct property *status;
+
+ status = kzalloc(sizeof(*status), GFP_KERNEL);
+ if (!status)
+ return -ENOMEM;
+
+ status->name = kstrdup("status", GFP_KERNEL);
+ if (!status->name)
+ return -ENOMEM;
+
+ status->value = kstrdup("disabled", GFP_KERNEL);
+ if (!status->value)
+ return -ENOMEM;
+
+ status->length = sizeof("disabled");
+
+ return of_update_property(np, status);
+}
+
+void xen_grant_dma_handle_sysrq(int key)
+{
+ struct device_node *np;
+ const char *path;
+ bool enable;
+
+ printk("%s: got key %d\n", __func__, key);
+
+ switch (key) {
+ case '0':
+ path = "/virtio@2000000";
+ enable = true;
+ break;
+
+ case '1':
+ path = "/virtio@2000200";
+ enable = true;
+ break;
+
+ case '2':
+ path = "/virtio@2000000";
+ enable = false;
+ break;
+
+ case '3':
+ path = "/virtio@2000200";
+ enable = false;
+ break;
+
+ case '4':
+ path = "/pcie@10000000";
+ enable = true;
+ break;
+
+ case '5':
+ path = "/pcie@10000000";
+ enable = false;
+ break;
+
+ default:
+ printk("%s: wrong key %d\n", __func__, key);
+ return;
+ }
+
+ np = of_find_node_by_path(path);
+ if (!np) {
+ printk("%s: failed to find node by path %s\n", __func__,
path);
+ return;
+ }
+
+ if (enable) {
+ xen_grant_dma_enable_device(np);
+ printk("%s: enable %s\n", __func__, path);
+ } else {
+ xen_grant_dma_disable_device(np);
+ printk("%s: disable %s\n", __func__, path);
+ }
+}
+
bool xen_virtio_mem_acc(struct virtio_device *dev)
{
if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT) || xen_pv_domain())
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index c16df629907e..6df96be1ea40 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -308,7 +308,8 @@ static void sysrq_handler(struct xenbus_watch
*watch, const char *path,
goto again;
if (sysrq_key != '\0')
- handle_sysrq(sysrq_key);
+ /*handle_sysrq(sysrq_key);*/
+ xen_grant_dma_handle_sysrq(sysrq_key);
}
static struct xenbus_watch sysrq_watch = {
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index a34f4271a2e9..c2da1bc24091 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -215,6 +215,8 @@ static inline void xen_preemptible_hcall_end(void) { }
#endif /* CONFIG_XEN_PV && !CONFIG_PREEMPTION */
+void xen_grant_dma_handle_sysrq(int key);
+
#ifdef CONFIG_XEN_GRANT_DMA_OPS
void xen_grant_setup_dma_ops(struct device *dev);
bool xen_is_grant_dma_device(struct device *dev);
(END)
So how it looks like:
1. DomU boots without PCI Host bridge probed. So nothing PCI related is
observed in DomU.
cat /proc/device-tree/pcie@10000000/status
disabled
2. I run backends in DomD and after that issue a signal to "enable"
root@generic-armv8-xt-dom0:~# xl sysrq DomU 4
3. The PCI Host bridge is probed, and all required PCI devices are
discovered
root@generic-armv8-xt-dom0:~# xl console DomU
[ 237.407620] xen_grant_dma_handle_sysrq: got key 52
[ 237.408133] pci-host-generic 10000000.pcie: host bridge
/pcie@10000000 ranges:
[ 237.408186] pci-host-generic 10000000.pcie: MEM
0x0023000000..0x0032ffffff -> 0x0023000000
[ 237.408231] pci-host-generic 10000000.pcie: MEM
0x0100000000..0x01ffffffff -> 0x0100000000
[ 237.408313] pci-host-generic 10000000.pcie: ECAM at [mem
0x10000000-0x1fffffff] for [bus 00-ff]
[ 237.408451] pci-host-generic 10000000.pcie: PCI host bridge to bus
0000:00
[ 237.408490] pci_bus 0000:00: root bus resource [bus 00-ff]
[ 237.408517] pci_bus 0000:00: root bus resource [mem
0x23000000-0x32ffffff]
[ 237.408545] pci_bus 0000:00: root bus resource [mem
0x100000000-0x1ffffffff pref]
[ 237.409043] pci 0000:00:00.0: [1b36:0008] type 00 class 0x060000
[ 237.458045] pci 0000:00:01.0: [1af4:1041] type 00 class 0x020000
[ 237.502588] pci 0000:00:01.0: reg 0x20: [mem 0x00000000-0x00003fff
64bit pref]
[ 237.507475] pci 0000:00:02.0: [1af4:1042] type 00 class 0x010000
[ 237.552706] pci 0000:00:02.0: reg 0x20: [mem 0x00000000-0x00003fff
64bit pref]
[ 237.559847] pci 0000:00:01.0: BAR 4: assigned [mem
0x100000000-0x100003fff 64bit pref]
[ 237.560411] pci 0000:00:02.0: BAR 4: assigned [mem
0x100004000-0x100007fff 64bit pref]
[ 237.563324] virtio-pci 0000:00:01.0: Set up Xen grant DMA ops (rid
0x8 sid 0x1)
[ 237.564833] virtio-pci 0000:00:01.0: enabling device (0000 -> 0002)
[ 237.582734] virtio-pci 0000:00:02.0: Set up Xen grant DMA ops (rid
0x10 sid 0x1)
[ 237.583413] virtio-pci 0000:00:02.0: enabling device (0000 -> 0002)
[ 237.595712] virtio_blk virtio1: 4/0/0 default/read/poll queues
[ 237.596227] virtio_net virtio0 enp0s1: renamed from eth1
[ 237.602499] virtio_blk virtio1: [vda] 4096000 512-byte logical blocks
(2.10 GB/1.95 GiB)
[ 237.606317] xen_grant_dma_handle_sysrq: enable /pcie@10000000
4. The same way the pseudo-hotremove would work (if we change the status
to "disabled" the corresponding device gets removed)
If this pseudo-hotplug sounds appropriate for the dom0less, the one of
the next questions would be what
mechanism to use for signalling (event, xenstore, whatever). Note that
the signal should only be sent
if all backends which serve virtio-pci devices within that PCI Host
bridge are ready.
>
>
>>> As an alternative I wonder, given that Xen emulates the PCI root
>>> complex, if we can reuse one of the PCI link up/down delays for this
>>> instead, like "pcie_wait_for_link". It looks like the wait time is in
>>> millisec while we would need potentially several seconds here but it
>>> might be possible?
>> I am not sure that I understand this alternative idea.
> The PCI subsystem has already a concept of wait times. Just have a look
> at pcie_wait_for_link under drivers/pci. The question was whether we can
> find a way to reuse one of the existing wait times to deal with our
> "wait" problem.
Thank you for the explanation. Even if it works, it would only be suitable
for virtio-pci
(for the virtio-mmio we would need to find another solution).
>
>
>>> Other ideas?
>> Another (crazy?) idea is to reuse CONFIG_XEN_VIRTIO_FORCE_GRANT for
>> dom0less system (I mean without "xen,grant-dma" bindings at all).
>> If virtio backends are always going to run in Dom0 when we have it up
>> and running, then it should work as domid == 0 is reserved for Dom0.
>> If there is a need to run virtio backends in other *backend* domain (for
>> the domain ID to be always known we could reserve an ID for it, so it
>> would be a const value),
>> we could probably introduce something configurable like
>> CONFIG_XEN_VIRTIO_FORCE_GRANT_BE_DOMID with 0 by default (or cmd line
>> option).
> The problem in a dom0less system is not so much how to tell which is the
> backend domid, because that is known in advance and could be added to
> device tree at boot somehow. The issue is how to ask the frontend to
> "wait" and then how to tell the frontend to "proceed" after the backend
> comes online.
please see above.
To summarize:
1. For normal case there is no problem with communicating the backend
domid on Arm with device-tree (neither for virtio-mmio nor for virtio-pci),
for the virtio-pci the V2 (PCI-IOMMU bindings) should be used. For the
dom0less there won't be a problem either as I understood from the discussion
(as the configuration is known in advance).
So I propose to concentrate on V2.
2. The problem is in supporting virtio for the dom0less in general
despite whether it is a foreign or grant mappings.
Here we would need a (pseudo-)hotplug or some other method to start
operating only when backend is available.
--
Regards,
Oleksandr Tyshchenko
+ Vikram
On Wed, 19 Oct 2022, Oleksandr Tyshchenko wrote:
> >> Regarding the virtio-mmio (platform) devices, yes, we could expose them
> >> with status "disabled", and they won't get probed by default.
> >> To be honest, I have experimented with that, when I was thinking of
> >> possible hotplug for virtio-mmio devices (I know, this sounds uncommon
> >> and strange).
> >> I used Linux feature (CONFIG_OF_DYNAMIC, overlays) to update the
> >> device-tree on running guest, so the toolstack initially inserts
> >> virtio-mmio device nodes for non-boot devices
> >> with status "disabled", and at the runtime, once we receive an event for
> >> example, we change the status to "ok" and the corresponding virtio-mmio
> >> device gets probed.
> >> But again, it is not a 100% hotplug, as we need to pre-allocate memory
> >> range and interrupt in advance (when generating guest device tree).
> > Actually this is really cool! Does it work? It doesn't matter to me if
> > the virtio devices are pci or mmio as long as we can solve the "wait"
> > problem. So this could be a good solution.
>
>
> ... yes, it does. Initially I experimented with virtio-mmio devices, but
> today I tried with PCI host bridge as well.
> I won't describe the commands which I used to apply/remove device-tree
> overlays from the userspace as well as the content of the
> dtso files I created, I will describe how that could be done from the
> kernel by using existing functionality (CONFIG_OF_DYNAMIC).
>
> As I said, if we exposed the devices with status "disabled", they
> wouldn't get probed by default. Once we receive a signal
> that otherend is ready, we change the status to "ok" and the
> corresponding device gets probed.
>
> So below is the test patch, which just changes the status of the required
> device-tree node (as you can see the code to update the property is
> simple enough),
> I hacked "xl sysrq" for the convenience of testing.
>
>
> diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c
> index 045c1805b2d5..9683ce075bc9 100644
> --- a/drivers/xen/grant-dma-ops.c
> +++ b/drivers/xen/grant-dma-ops.c
> @@ -10,6 +10,7 @@
> #include <linux/module.h>
> #include <linux/dma-map-ops.h>
> #include <linux/of.h>
> +#include <linux/of_platform.h>
> #include <linux/pci.h>
> #include <linux/pfn.h>
> #include <linux/xarray.h>
> @@ -379,6 +380,108 @@ bool xen_is_grant_dma_device(struct device *dev)
> return false;
> }
>
> +/* TODO: Consider using statically allocated (struct property status) */
> +static int xen_grant_dma_enable_device(struct device_node *np)
> +{
> + struct property *status;
> +
> + status = kzalloc(sizeof(*status), GFP_KERNEL);
> + if (!status)
> + return -ENOMEM;
> +
> + status->name = kstrdup("status", GFP_KERNEL);
> + if (!status->name)
> + return -ENOMEM;
> +
> + status->value = kstrdup("okay", GFP_KERNEL);
> + if (!status->value)
> + return -ENOMEM;
> +
> + status->length = sizeof("okay");
> +
> + return of_update_property(np, status);
> +}
> +
> +static int xen_grant_dma_disable_device(struct device_node *np)
> +{
> + struct property *status;
> +
> + status = kzalloc(sizeof(*status), GFP_KERNEL);
> + if (!status)
> + return -ENOMEM;
> +
> + status->name = kstrdup("status", GFP_KERNEL);
> + if (!status->name)
> + return -ENOMEM;
> +
> + status->value = kstrdup("disabled", GFP_KERNEL);
> + if (!status->value)
> + return -ENOMEM;
> +
> + status->length = sizeof("disabled");
> +
> + return of_update_property(np, status);
> +}
> +
> +void xen_grant_dma_handle_sysrq(int key)
> +{
> + struct device_node *np;
> + const char *path;
> + bool enable;
> +
> + printk("%s: got key %d\n", __func__, key);
> +
> + switch (key) {
> + case '0':
> + path = "/virtio@2000000";
> + enable = true;
> + break;
> +
> + case '1':
> + path = "/virtio@2000200";
> + enable = true;
> + break;
> +
> + case '2':
> + path = "/virtio@2000000";
> + enable = false;
> + break;
> +
> + case '3':
> + path = "/virtio@2000200";
> + enable = false;
> + break;
> +
> + case '4':
> + path = "/pcie@10000000";
> + enable = true;
> + break;
> +
> + case '5':
> + path = "/pcie@10000000";
> + enable = false;
> + break;
> +
> + default:
> + printk("%s: wrong key %d\n", __func__, key);
> + return;
> + }
> +
> + np = of_find_node_by_path(path);
> + if (!np) {
> + printk("%s: failed to find node by path %s\n", __func__,
> path);
> + return;
> + }
> +
> + if (enable) {
> + xen_grant_dma_enable_device(np);
> + printk("%s: enable %s\n", __func__, path);
> + } else {
> + xen_grant_dma_disable_device(np);
> + printk("%s: disable %s\n", __func__, path);
> + }
> +}
> +
> bool xen_virtio_mem_acc(struct virtio_device *dev)
> {
> if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT) || xen_pv_domain())
> diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
> index c16df629907e..6df96be1ea40 100644
> --- a/drivers/xen/manage.c
> +++ b/drivers/xen/manage.c
> @@ -308,7 +308,8 @@ static void sysrq_handler(struct xenbus_watch
> *watch, const char *path,
> goto again;
>
> if (sysrq_key != '\0')
> - handle_sysrq(sysrq_key);
> + /*handle_sysrq(sysrq_key);*/
> + xen_grant_dma_handle_sysrq(sysrq_key);
> }
>
> static struct xenbus_watch sysrq_watch = {
> diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
> index a34f4271a2e9..c2da1bc24091 100644
> --- a/include/xen/xen-ops.h
> +++ b/include/xen/xen-ops.h
> @@ -215,6 +215,8 @@ static inline void xen_preemptible_hcall_end(void) { }
>
> #endif /* CONFIG_XEN_PV && !CONFIG_PREEMPTION */
>
> +void xen_grant_dma_handle_sysrq(int key);
> +
> #ifdef CONFIG_XEN_GRANT_DMA_OPS
> void xen_grant_setup_dma_ops(struct device *dev);
> bool xen_is_grant_dma_device(struct device *dev);
> (END)
>
> So how it looks like:
>
> 1. DomU boots without PCI Host bridge probed. So nothing PCI related is
> observed in DomU.
>
> cat /proc/device-tree/pcie@10000000/status
> disabled
>
> 2. I run backends in DomD and after that issue a signal to "enable"
>
> root@generic-armv8-xt-dom0:~# xl sysrq DomU 4
>
> 3. The PCI Host bridge is probed, and all required PCI devices are
> discovered
>
> root@generic-armv8-xt-dom0:~# xl console DomU
> [ 237.407620] xen_grant_dma_handle_sysrq: got key 52
> [ 237.408133] pci-host-generic 10000000.pcie: host bridge
> /pcie@10000000 ranges:
> [ 237.408186] pci-host-generic 10000000.pcie: MEM
> 0x0023000000..0x0032ffffff -> 0x0023000000
> [ 237.408231] pci-host-generic 10000000.pcie: MEM
> 0x0100000000..0x01ffffffff -> 0x0100000000
> [ 237.408313] pci-host-generic 10000000.pcie: ECAM at [mem
> 0x10000000-0x1fffffff] for [bus 00-ff]
> [ 237.408451] pci-host-generic 10000000.pcie: PCI host bridge to bus
> 0000:00
> [ 237.408490] pci_bus 0000:00: root bus resource [bus 00-ff]
> [ 237.408517] pci_bus 0000:00: root bus resource [mem
> 0x23000000-0x32ffffff]
> [ 237.408545] pci_bus 0000:00: root bus resource [mem
> 0x100000000-0x1ffffffff pref]
> [ 237.409043] pci 0000:00:00.0: [1b36:0008] type 00 class 0x060000
> [ 237.458045] pci 0000:00:01.0: [1af4:1041] type 00 class 0x020000
> [ 237.502588] pci 0000:00:01.0: reg 0x20: [mem 0x00000000-0x00003fff
> 64bit pref]
> [ 237.507475] pci 0000:00:02.0: [1af4:1042] type 00 class 0x010000
> [ 237.552706] pci 0000:00:02.0: reg 0x20: [mem 0x00000000-0x00003fff
> 64bit pref]
> [ 237.559847] pci 0000:00:01.0: BAR 4: assigned [mem
> 0x100000000-0x100003fff 64bit pref]
> [ 237.560411] pci 0000:00:02.0: BAR 4: assigned [mem
> 0x100004000-0x100007fff 64bit pref]
> [ 237.563324] virtio-pci 0000:00:01.0: Set up Xen grant DMA ops (rid
> 0x8 sid 0x1)
> [ 237.564833] virtio-pci 0000:00:01.0: enabling device (0000 -> 0002)
> [ 237.582734] virtio-pci 0000:00:02.0: Set up Xen grant DMA ops (rid
> 0x10 sid 0x1)
> [ 237.583413] virtio-pci 0000:00:02.0: enabling device (0000 -> 0002)
> [ 237.595712] virtio_blk virtio1: 4/0/0 default/read/poll queues
> [ 237.596227] virtio_net virtio0 enp0s1: renamed from eth1
> [ 237.602499] virtio_blk virtio1: [vda] 4096000 512-byte logical blocks
> (2.10 GB/1.95 GiB)
> [ 237.606317] xen_grant_dma_handle_sysrq: enable /pcie@10000000
>
> 4. The same way the pseudo-hotremove would work (if we change the status
> to "disabled" the corresponding device gets removed)
>
>
> If this pseudo-hotplug sounds appropriate for the dom0less,
This is great! Yes I think it is totally acceptable.
> one of the next questions would be what mechanism to use for
> signalling (event, xenstore, whatever).
For your information, we had to solve a similar issue a few months ago
to let a domU discover a newly added and directly assigned programmable
logic block. That was also done by applying DT overlays, first to Xen,
then to the domU.
Have a look at Vikram's Xen Summit presentation:
https://static.sched.com/hosted_files/xen2022/e8/Introduce%20Dynamic%20Device%20Node%20Programming%20for%20Xen.pdf
We wrote a small xenstore-based protocol to notify the domU and also to
transfer the overlay to it:
https://github.com/Xilinx/xen/blob/xlnx_rebase_4.16/docs/misc/arm/overlay.txt
https://github.com/Xilinx/xen/blob/xlnx_rebase_4.16/tools/helpers/get_overlay.c
There is a good description starting at slide 16 in the PDF.
I am only sharing this as FYI. This Virtio problem is simpler because we
already know the devices that are going to become available. We don't
need an actual DT overlay to be passed to the domU. So we could get away
with just a single interrupt or a single xenstore property.
> Note that signal should only be sent if all backends which serve
> virtio-pci devices within that PCI Host bridge are ready.
Yes. That should be fine as long as all the backends are in the same
domain. I can imagine there could be difficulties if the backends are
in different domains: backend-domain-1 would have to tell dom0 that it
is ready, then backend-domain-2 would have to do the same, then dom0
finally notifies the domU, or something like that.
Anyway, I think this is good enough to start as a solution. Excellent!
> >>> Other ideas?
> >> Another (crazy?) idea is to reuse CONFIG_XEN_VIRTIO_FORCE_GRANT for
> >> dom0less system (I mean without "xen,grant-dma" bindings at all).
> >> If virtio backends are always going to run in Dom0 when we have it up
> >> and running, then it should work as domid == 0 is reserved for Dom0.
> >> If there is a need to run virtio backends in other *backend* domain (for
> >> the domain ID to be always known we could reserve an ID for it, so it
> >> would be a const value),
> >> we could probably introduce something configurable like
> >> CONFIG_XEN_VIRTIO_FORCE_GRANT_BE_DOMID with 0 by default (or cmd line
> >> option).
> > The problem in a dom0less system is not much how to tell which is the
> > backend domid, because that is known in advance and could be added to
> > device tree at boot somehow. The issue is how to ask the frontend to
> > "wait" and then how to tell the frontend to "proceed" after the backend
> > comes online.
>
> please see above.
>
>
> To summarize:
>
> 1. For normal case there is no problem with communicating the backend
> domid on Arm with device-tree (neither for virtio-mmio nor for virtio-pci),
> for the virtio-pci the V2 (PCI-IOMMU bindings) should be used. For the
> dom0less there won't be a problem either, as I understood from the discussion
> (as the configuration is known in advance).
> So I propose to concentrate on V2.
Yes I agree
> 2. The problem is in supporting virtio for the dom0less in general
> regardless of whether it uses foreign or grant mappings.
> Here we would need a (pseudo-)hotplug or some other method to start
> operating only when backend is available.
Yes I think you are right
On 19.10.22 23:38, Stefano Stabellini wrote:
Hello Stefano
> + Vikram
>
> On Wed, 19 Oct 2022, Oleksandr Tyshchenko wrote:
>>>> Regarding the virtio-mmio (platform) devices, yes, we could expose them
>>>> with status "disabled", and they won't get probed by default.
>>>> To be honest, I have experimented with that, when I was thinking of
>>>> possible hotplug for virtio-mmio devices (I know, this sounds uncommon
>>>> and strange).
>>>> I used Linux feature (CONFIG_OF_DYNAMIC, overlays) to update the
>>>> device-tree on running guest, so the toolstack initially inserts
>>>> virtio-mmio device nodes for non-boot devices
>>>> with status "disabled", and at the runtime, once we receive an event for
>>>> example, we change the status to "ok" and the corresponding virtio-mmio
>>>> device gets probed.
>>>> But again, it is not a 100% hotplug, as we need to pre-allocate memory
>>>> range and interrupt in advance (when generating guest device tree).
>>> Actually this is really cool! Does it work? It doesn't matter to me if
>>> the virtio devices are pci or mmio as long as we can solve the "wait"
>>> problem. So this could be a good solution.
>>
>> ... yes, it does. Initially I experimented with virtio-mmio devices, but
>> today I tried with PCI host bridge as well.
>> I won't describe the commands which I used to apply/remove device-tree
>> overlays from the userspace as well as the content of
>> dtso files I created, I will describe how that could be done from the
>> kernel by using existing functionality (CONFIG_OF_DYNAMIC).
>>
>> As I said if we exposed the devices with status "disabled", they
>> wouldn't get probed by default. Once we receive a signal
>> that otherend is ready, we change the status to "ok" and the
>> corresponding device gets probed.
>>
>> So below the test patch, which just changes the status of the required
>> device-tree node (as you can see the code to update the property is
>> simple enough),
>> I hacked "xl sysrq" for the convenience of testing.
>>
>>
>> diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c
>> index 045c1805b2d5..9683ce075bc9 100644
>> --- a/drivers/xen/grant-dma-ops.c
>> +++ b/drivers/xen/grant-dma-ops.c
>> @@ -10,6 +10,7 @@
>> #include <linux/module.h>
>> #include <linux/dma-map-ops.h>
>> #include <linux/of.h>
>> +#include <linux/of_platform.h>
>> #include <linux/pci.h>
>> #include <linux/pfn.h>
>> #include <linux/xarray.h>
>> @@ -379,6 +380,108 @@ bool xen_is_grant_dma_device(struct device *dev)
>> return false;
>> }
>>
>> +/* TODO: Consider using statically allocated (struct property status) */
>> +static int xen_grant_dma_enable_device(struct device_node *np)
>> +{
>> + struct property *status;
>> +
>> + status = kzalloc(sizeof(*status), GFP_KERNEL);
>> + if (!status)
>> + return -ENOMEM;
>> +
>> + status->name = kstrdup("status", GFP_KERNEL);
>> + if (!status->name)
>> + return -ENOMEM;
>> +
>> + status->value = kstrdup("okay", GFP_KERNEL);
>> + if (!status->value)
>> + return -ENOMEM;
>> +
>> + status->length = sizeof("okay");
>> +
>> + return of_update_property(np, status);
>> +}
>> +
>> +static int xen_grant_dma_disable_device(struct device_node *np)
>> +{
>> + struct property *status;
>> +
>> + status = kzalloc(sizeof(*status), GFP_KERNEL);
>> + if (!status)
>> + return -ENOMEM;
>> +
>> + status->name = kstrdup("status", GFP_KERNEL);
>> + if (!status->name)
>> + return -ENOMEM;
>> +
>> + status->value = kstrdup("disabled", GFP_KERNEL);
>> + if (!status->value)
>> + return -ENOMEM;
>> +
>> + status->length = sizeof("disabled");
>> +
>> + return of_update_property(np, status);
>> +}
>> +
>> +void xen_grant_dma_handle_sysrq(int key)
>> +{
>> + struct device_node *np;
>> + const char *path;
>> + bool enable;
>> +
>> + printk("%s: got key %d\n", __func__, key);
>> +
>> + switch (key) {
>> + case '0':
>> + path = "/virtio@2000000";
>> + enable = true;
>> + break;
>> +
>> + case '1':
>> + path = "/virtio@2000200";
>> + enable = true;
>> + break;
>> +
>> + case '2':
>> + path = "/virtio@2000000";
>> + enable = false;
>> + break;
>> +
>> + case '3':
>> + path = "/virtio@2000200";
>> + enable = false;
>> + break;
>> +
>> + case '4':
>> + path = "/pcie@10000000";
>> + enable = true;
>> + break;
>> +
>> + case '5':
>> + path = "/pcie@10000000";
>> + enable = false;
>> + break;
>> +
>> + default:
>> + printk("%s: wrong key %d\n", __func__, key);
>> + return;
>> + }
>> +
>> + np = of_find_node_by_path(path);
>> + if (!np) {
>> + printk("%s: failed to find node by path %s\n", __func__,
>> path);
>> + return;
>> + }
>> +
>> + if (enable) {
>> + xen_grant_dma_enable_device(np);
>> + printk("%s: enable %s\n", __func__, path);
>> + } else {
>> + xen_grant_dma_disable_device(np);
>> + printk("%s: disable %s\n", __func__, path);
>> + }
>> +}
>> +
>> bool xen_virtio_mem_acc(struct virtio_device *dev)
>> {
>> if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT) || xen_pv_domain())
>> diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
>> index c16df629907e..6df96be1ea40 100644
>> --- a/drivers/xen/manage.c
>> +++ b/drivers/xen/manage.c
>> @@ -308,7 +308,8 @@ static void sysrq_handler(struct xenbus_watch
>> *watch, const char *path,
>> goto again;
>>
>> if (sysrq_key != '\0')
>> - handle_sysrq(sysrq_key);
>> + /*handle_sysrq(sysrq_key);*/
>> + xen_grant_dma_handle_sysrq(sysrq_key);
>> }
>>
>> static struct xenbus_watch sysrq_watch = {
>> diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
>> index a34f4271a2e9..c2da1bc24091 100644
>> --- a/include/xen/xen-ops.h
>> +++ b/include/xen/xen-ops.h
>> @@ -215,6 +215,8 @@ static inline void xen_preemptible_hcall_end(void) { }
>>
>> #endif /* CONFIG_XEN_PV && !CONFIG_PREEMPTION */
>>
>> +void xen_grant_dma_handle_sysrq(int key);
>> +
>> #ifdef CONFIG_XEN_GRANT_DMA_OPS
>> void xen_grant_setup_dma_ops(struct device *dev);
>> bool xen_is_grant_dma_device(struct device *dev);
>> (END)
>>
>> So how it looks like:
>>
>> 1. DomU boots without PCI Host bridge probed. So nothing PCI related is
>> observed in DomU.
>>
>> cat /proc/device-tree/pcie@10000000/status
>> disabled
>>
>> 2. I run backends in DomD and after that issue a signal to "enable"
>>
>> root@generic-armv8-xt-dom0:~# xl sysrq DomU 4
>>
>> 3. The PCI Host bridge is probed, and all required PCI devices are
>> discovered
>>
>> root@generic-armv8-xt-dom0:~# xl console DomU
>> [ 237.407620] xen_grant_dma_handle_sysrq: got key 52
>> [ 237.408133] pci-host-generic 10000000.pcie: host bridge
>> /pcie@10000000 ranges:
>> [ 237.408186] pci-host-generic 10000000.pcie: MEM
>> 0x0023000000..0x0032ffffff -> 0x0023000000
>> [ 237.408231] pci-host-generic 10000000.pcie: MEM
>> 0x0100000000..0x01ffffffff -> 0x0100000000
>> [ 237.408313] pci-host-generic 10000000.pcie: ECAM at [mem
>> 0x10000000-0x1fffffff] for [bus 00-ff]
>> [ 237.408451] pci-host-generic 10000000.pcie: PCI host bridge to bus
>> 0000:00
>> [ 237.408490] pci_bus 0000:00: root bus resource [bus 00-ff]
>> [ 237.408517] pci_bus 0000:00: root bus resource [mem
>> 0x23000000-0x32ffffff]
>> [ 237.408545] pci_bus 0000:00: root bus resource [mem
>> 0x100000000-0x1ffffffff pref]
>> [ 237.409043] pci 0000:00:00.0: [1b36:0008] type 00 class 0x060000
>> [ 237.458045] pci 0000:00:01.0: [1af4:1041] type 00 class 0x020000
>> [ 237.502588] pci 0000:00:01.0: reg 0x20: [mem 0x00000000-0x00003fff
>> 64bit pref]
>> [ 237.507475] pci 0000:00:02.0: [1af4:1042] type 00 class 0x010000
>> [ 237.552706] pci 0000:00:02.0: reg 0x20: [mem 0x00000000-0x00003fff
>> 64bit pref]
>> [ 237.559847] pci 0000:00:01.0: BAR 4: assigned [mem
>> 0x100000000-0x100003fff 64bit pref]
>> [ 237.560411] pci 0000:00:02.0: BAR 4: assigned [mem
>> 0x100004000-0x100007fff 64bit pref]
>> [ 237.563324] virtio-pci 0000:00:01.0: Set up Xen grant DMA ops (rid
>> 0x8 sid 0x1)
>> [ 237.564833] virtio-pci 0000:00:01.0: enabling device (0000 -> 0002)
>> [ 237.582734] virtio-pci 0000:00:02.0: Set up Xen grant DMA ops (rid
>> 0x10 sid 0x1)
>> [ 237.583413] virtio-pci 0000:00:02.0: enabling device (0000 -> 0002)
>> [ 237.595712] virtio_blk virtio1: 4/0/0 default/read/poll queues
>> [ 237.596227] virtio_net virtio0 enp0s1: renamed from eth1
>> [ 237.602499] virtio_blk virtio1: [vda] 4096000 512-byte logical blocks
>> (2.10 GB/1.95 GiB)
>> [ 237.606317] xen_grant_dma_handle_sysrq: enable /pcie@10000000
>>
>> 4. The same way the pseudo-hotremove would work (if we change the status
>> to "disabled" the corresponding device gets removed)
>>
>>
>> If this pseudo-hotplug sounds appropriate for the dom0less,
> This is great! Yes I think it is totally acceptable.
Perfect!
>
>
>> one of the next questions would be what mechanism to use for
>> signalling (event, xenstore, whatever).
> For your information, we had to solve a similar issue a few months ago
> to let a domU discover a newly added and directly assigned programmable
> logic block. That was also done by applying DT overlays, first to Xen,
> then to the domU.
>
> Have a look at Vikram's Xen Summit presentation:
> https://urldefense.com/v3/__https://static.sched.com/hosted_files/xen2022/e8/Introduce*20Dynamic*20Device*20Node*20Programming*20for*20Xen.pdf__;JSUlJSUl!!GF_29dbcQIUBPA!yTJGjmmfrdJfoc-magnbk6EcuNspXxuQyDMzItgaV9mehJMj37w7Go_O9tDCQV3Qpij0PORZdiZsjBYROsaRNWVjPNUucg$ [static[.]sched[.]com]
>
> We wrote a small xenstore-based protocol to notify the domU and also to
> transfer the overlay to it:
>
> https://urldefense.com/v3/__https://github.com/Xilinx/xen/blob/xlnx_rebase_4.16/docs/misc/arm/overlay.txt__;!!GF_29dbcQIUBPA!yTJGjmmfrdJfoc-magnbk6EcuNspXxuQyDMzItgaV9mehJMj37w7Go_O9tDCQV3Qpij0PORZdiZsjBYROsaRNWXs7xlouA$ [github[.]com]
> https://urldefense.com/v3/__https://github.com/Xilinx/xen/blob/xlnx_rebase_4.16/tools/helpers/get_overlay.c__;!!GF_29dbcQIUBPA!yTJGjmmfrdJfoc-magnbk6EcuNspXxuQyDMzItgaV9mehJMj37w7Go_O9tDCQV3Qpij0PORZdiZsjBYROsaRNWXoR3gEjw$ [github[.]com]
>
> There is a good description starting at slide 16 in the PDF.
>
>
> I am only sharing this as FYI.
Very interesting materials and impressive work! I can't even imagine
what it took to get it working.
> This Virtio problem is simpler because we
> already know the devices that are going to become available. We don't
> need an actual DT overlay to be passed to the domU. So we could get away
> with just a single interrupt or a single xenstore property.
I completely agree that for virtio it is going to be simpler than for FPGA.
>
>
>> Note that signal should only be sent if all backends which serve
>> virtio-pci devices within that PCI Host bridge are ready.
> Yes. That should be fine as long as all the backends are in the same
> domain. I can imagine there could be difficulties if the backends are
> in different domains: backend-domain-1 would have to tell dom0 that it
> is ready, then backend-domain-2 would have to do the same, then dom0
> finally notifies the domU, or something like that.
>
> Anyway, I think this is good enough to start as a solution. Excellent!
>
>
>>>>> Other ideas?
>>>> Another (crazy?) idea is to reuse CONFIG_XEN_VIRTIO_FORCE_GRANT for
>>>> dom0less system (I mean without "xen,grant-dma" bindings at all).
>>>> If virtio backends are always going to run in Dom0 when we have it up
>>>> and running, then it should work as domid == 0 is reserved for Dom0.
>>>> If there is a need to run virtio backends in other *backend* domain (for
>>>> the domain ID to be always known we could reserve an ID for it, so it
>>>> would be a const value),
>>>> we could probably introduce something configurable like
>>>> CONFIG_XEN_VIRTIO_FORCE_GRANT_BE_DOMID with 0 by default (or cmd line
>>>> option).
>>> The problem in a dom0less system is not much how to tell which is the
>>> backend domid, because that is known in advance and could be added to
>>> device tree at boot somehow. The issue is how to ask the frontend to
>>> "wait" and then how to tell the frontend to "proceed" after the backend
>>> comes online.
>> please see above.
>>
>>
>> To summarize:
>>
>> 1. For normal case there is no problem with communicating the backend
>> domid on Arm with device-tree (neither for virtio-mmio nor for virtio-pci),
>> for the virtio-pci the V2 (PCI-IOMMU bindings) should be used. For the
>> dom0less there won't be a problem either, as I understood from the discussion
>> (as the configuration is known in advance).
>> So I propose to concentrate on V2.
> Yes I agree
>
>
>> 2. The problem is in supporting virtio for the dom0less in general
>> regardless of whether it uses foreign or grant mappings.
>> Here we would need a (pseudo-)hotplug or some other method to start
>> operating only when backend is available.
> Yes I think you are right
--
Regards,
Oleksandr Tyshchenko