The VFIO_DEVICE_GET_INFO, VFIO_DEVICE_GET_REGION_INFO, and
VFIO_IOMMU_GET_INFO ioctls fill in an info struct followed by capability
structs:
+------+---------+---------+-----+
| info | caps[0] | caps[1] | ... |
+------+---------+---------+-----+
Both the info and capability struct sizes are not always multiples of
sizeof(u64), leaving u64 fields in later capability structs misaligned.
Userspace applications currently need to handle misalignment manually in
order to support CPU architectures and programming languages with strict
alignment requirements.
Make life easier for userspace by ensuring alignment in the kernel.
The new layout is as follows:
+------+---+---------+---------+---+-----+
| info | 0 | caps[0] | caps[1] | 0 | ... |
+------+---+---------+---------+---+-----+
In this example info and caps[1] have sizes that are not multiples of
sizeof(u64), so zero padding is added to align the subsequent structure.
Adding zero padding between structs does not break the uapi. The memory
layout is specified by the info.cap_offset and caps[i].next fields
filled in by the kernel. Applications use these field values to locate
structs and are therefore unaffected by the addition of zero padding.
Signed-off-by: Stefan Hajnoczi <[email protected]>
---
include/linux/vfio.h | 2 +-
drivers/gpu/drm/i915/gvt/kvmgt.c | 7 +++--
drivers/s390/cio/vfio_ccw_ops.c | 7 +++--
drivers/vfio/pci/vfio_pci_core.c | 14 ++++++---
drivers/vfio/vfio_iommu_type1.c | 7 +++--
drivers/vfio/vfio_main.c | 53 +++++++++++++++++++++++++++-----
6 files changed, 71 insertions(+), 19 deletions(-)
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 2c137ea94a3e..ff0864e73cc3 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -272,7 +272,7 @@ struct vfio_info_cap {
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
size_t size, u16 id,
u16 version);
-void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset);
+ssize_t vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset);
int vfio_info_add_capability(struct vfio_info_cap *caps,
struct vfio_info_cap_header *cap, size_t size);
diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index de675d799c7d..9060e9c6ac7c 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -1297,7 +1297,10 @@ static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd,
info.argsz = sizeof(info) + caps.size;
info.cap_offset = 0;
} else {
- vfio_info_cap_shift(&caps, sizeof(info));
+ ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info));
+ if (cap_offset < 0)
+ return cap_offset;
+
if (copy_to_user((void __user *)arg +
sizeof(info), caps.buf,
caps.size)) {
@@ -1305,7 +1308,7 @@ static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd,
kfree(sparse);
return -EFAULT;
}
- info.cap_offset = sizeof(info);
+ info.cap_offset = cap_offset;
}
kfree(caps.buf);
diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
index 5b53b94f13c7..63d5163376a5 100644
--- a/drivers/s390/cio/vfio_ccw_ops.c
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -361,13 +361,16 @@ static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private,
info->argsz = sizeof(*info) + caps.size;
info->cap_offset = 0;
} else {
- vfio_info_cap_shift(&caps, sizeof(*info));
+ ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(*info));
+ if (cap_offset < 0)
+ return cap_offset;
+
if (copy_to_user((void __user *)arg + sizeof(*info),
caps.buf, caps.size)) {
kfree(caps.buf);
return -EFAULT;
}
- info->cap_offset = sizeof(*info);
+ info->cap_offset = cap_offset;
}
kfree(caps.buf);
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 20d7b69ea6ff..92c093b99187 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -966,12 +966,15 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev,
if (info.argsz < sizeof(info) + caps.size) {
info.argsz = sizeof(info) + caps.size;
} else {
- vfio_info_cap_shift(&caps, sizeof(info));
+ ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info));
+ if (cap_offset < 0)
+ return cap_offset;
+
if (copy_to_user(arg + 1, caps.buf, caps.size)) {
kfree(caps.buf);
return -EFAULT;
}
- info.cap_offset = sizeof(*arg);
+ info.cap_offset = cap_offset;
}
kfree(caps.buf);
@@ -1107,12 +1110,15 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
info.argsz = sizeof(info) + caps.size;
info.cap_offset = 0;
} else {
- vfio_info_cap_shift(&caps, sizeof(info));
+ ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info));
+ if (cap_offset < 0)
+ return cap_offset;
+
if (copy_to_user(arg + 1, caps.buf, caps.size)) {
kfree(caps.buf);
return -EFAULT;
}
- info.cap_offset = sizeof(*arg);
+ info.cap_offset = cap_offset;
}
kfree(caps.buf);
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index ebe0ad31d0b0..ab64b9e3ed7c 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -2808,14 +2808,17 @@ static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
if (info.argsz < sizeof(info) + caps.size) {
info.argsz = sizeof(info) + caps.size;
} else {
- vfio_info_cap_shift(&caps, sizeof(info));
+ ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info));
+ if (cap_offset < 0)
+ return cap_offset;
+
if (copy_to_user((void __user *)arg +
sizeof(info), caps.buf,
caps.size)) {
kfree(caps.buf);
return -EFAULT;
}
- info.cap_offset = sizeof(info);
+ info.cap_offset = cap_offset;
}
kfree(caps.buf);
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index f0ca33b2e1df..4fc8698577a7 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -1171,8 +1171,18 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
{
void *buf;
struct vfio_info_cap_header *header, *tmp;
+ size_t header_offset;
+ size_t new_size;
- buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
+ /*
+ * Reserve extra space when the previous capability was not a multiple of
+ * the largest field size. This ensures that capabilities are properly
+ * aligned.
+ */
+ header_offset = ALIGN(caps->size, sizeof(u64));
+ new_size = header_offset + size;
+
+ buf = krealloc(caps->buf, new_size, GFP_KERNEL);
if (!buf) {
kfree(caps->buf);
caps->buf = NULL;
@@ -1181,10 +1191,10 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
}
caps->buf = buf;
- header = buf + caps->size;
+ header = buf + header_offset;
/* Eventually copied to user buffer, zero */
- memset(header, 0, size);
+ memset(buf + caps->size, 0, new_size - caps->size);
header->id = id;
header->version = version;
@@ -1193,20 +1203,47 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
for (tmp = buf; tmp->next; tmp = buf + tmp->next)
; /* nothing */
- tmp->next = caps->size;
- caps->size += size;
+ tmp->next = header_offset;
+ caps->size = new_size;
return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);
-void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
+/*
+ * Adjust the capability next fields to account for the given offset at which
+ * capability structures start and any padding added for alignment. Returns the
+ * cap_offset or -errno.
+ */
+ssize_t vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
struct vfio_info_cap_header *tmp;
+ struct vfio_info_cap_header *next_tmp;
void *buf = (void *)caps->buf;
+ size_t pad = ALIGN(offset, sizeof(u64)) - offset;
+ size_t cap_offset = offset + pad;
- for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
- tmp->next += offset;
+ /* Shift the next fields to account for offset and pad */
+ for (tmp = buf; tmp->next; tmp = next_tmp) {
+ next_tmp = buf + tmp->next;
+ tmp->next += cap_offset;
+ }
+
+ /* Pad with zeroes so capabilities start with proper alignment */
+ buf = krealloc(caps->buf, caps->size + pad, GFP_KERNEL);
+ if (!buf) {
+ kfree(caps->buf);
+ caps->buf = NULL;
+ caps->size = 0;
+ return -ENOMEM;
+ }
+
+ memmove(buf + pad, buf, caps->size);
+ memset(buf, 0, pad);
+
+ caps->buf = buf;
+ caps->size += pad;
+ return cap_offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);
--
2.41.0
On Thu, 3 Aug 2023 10:41:09 -0400
Stefan Hajnoczi <[email protected]> wrote:
> The VFIO_DEVICE_GET_INFO, VFIO_DEVICE_GET_REGION_INFO, and
> VFIO_IOMMU_GET_INFO ioctls fill in an info struct followed by capability
> structs:
>
> +------+---------+---------+-----+
> | info | caps[0] | caps[1] | ... |
> +------+---------+---------+-----+
>
> Both the info and capability struct sizes are not always multiples of
> sizeof(u64), leaving u64 fields in later capability structs misaligned.
>
> Userspace applications currently need to handle misalignment manually in
> order to support CPU architectures and programming languages with strict
> alignment requirements.
>
> Make life easier for userspace by ensuring alignment in the kernel.
> The new layout is as follows:
>
> +------+---+---------+---------+---+-----+
> | info | 0 | caps[0] | caps[1] | 0 | ... |
> +------+---+---------+---------+---+-----+
>
> In this example info and caps[1] have sizes that are not multiples of
> sizeof(u64), so zero padding is added to align the subsequent structure.
>
> Adding zero padding between structs does not break the uapi. The memory
> layout is specified by the info.cap_offset and caps[i].next fields
> filled in by the kernel. Applications use these field values to locate
> structs and are therefore unaffected by the addition of zero padding.
>
> Signed-off-by: Stefan Hajnoczi <[email protected]>
> ---
> include/linux/vfio.h | 2 +-
> drivers/gpu/drm/i915/gvt/kvmgt.c | 7 +++--
> drivers/s390/cio/vfio_ccw_ops.c | 7 +++--
> drivers/vfio/pci/vfio_pci_core.c | 14 ++++++---
> drivers/vfio/vfio_iommu_type1.c | 7 +++--
> drivers/vfio/vfio_main.c | 53 +++++++++++++++++++++++++++-----
> 6 files changed, 71 insertions(+), 19 deletions(-)
>
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 2c137ea94a3e..ff0864e73cc3 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -272,7 +272,7 @@ struct vfio_info_cap {
> struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
> size_t size, u16 id,
> u16 version);
> -void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset);
> +ssize_t vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset);
>
> int vfio_info_add_capability(struct vfio_info_cap *caps,
> struct vfio_info_cap_header *cap, size_t size);
> diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
> index de675d799c7d..9060e9c6ac7c 100644
> --- a/drivers/gpu/drm/i915/gvt/kvmgt.c
> +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
> @@ -1297,7 +1297,10 @@ static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd,
> info.argsz = sizeof(info) + caps.size;
> info.cap_offset = 0;
> } else {
> - vfio_info_cap_shift(&caps, sizeof(info));
> + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info));
> + if (cap_offset < 0)
> + return cap_offset;
> +
> if (copy_to_user((void __user *)arg +
> sizeof(info), caps.buf,
> caps.size)) {
> @@ -1305,7 +1308,7 @@ static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd,
> kfree(sparse);
> return -EFAULT;
> }
> - info.cap_offset = sizeof(info);
> + info.cap_offset = cap_offset;
The copy_to_user() above needs to be modified to make this true:
copy_to_user((void __user *)arg + cap_offset,...
Same for all similar below.
> }
>
> kfree(caps.buf);
> diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
> index 5b53b94f13c7..63d5163376a5 100644
> --- a/drivers/s390/cio/vfio_ccw_ops.c
> +++ b/drivers/s390/cio/vfio_ccw_ops.c
> @@ -361,13 +361,16 @@ static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private,
> info->argsz = sizeof(*info) + caps.size;
> info->cap_offset = 0;
> } else {
> - vfio_info_cap_shift(&caps, sizeof(*info));
> + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(*info));
> + if (cap_offset < 0)
> + return cap_offset;
> +
> if (copy_to_user((void __user *)arg + sizeof(*info),
> caps.buf, caps.size)) {
> kfree(caps.buf);
> return -EFAULT;
> }
> - info->cap_offset = sizeof(*info);
> + info->cap_offset = cap_offset;
> }
>
> kfree(caps.buf);
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 20d7b69ea6ff..92c093b99187 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -966,12 +966,15 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev,
> if (info.argsz < sizeof(info) + caps.size) {
> info.argsz = sizeof(info) + caps.size;
> } else {
> - vfio_info_cap_shift(&caps, sizeof(info));
> + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info));
> + if (cap_offset < 0)
> + return cap_offset;
> +
> if (copy_to_user(arg + 1, caps.buf, caps.size)) {
> kfree(caps.buf);
> return -EFAULT;
> }
> - info.cap_offset = sizeof(*arg);
> + info.cap_offset = cap_offset;
> }
>
> kfree(caps.buf);
> @@ -1107,12 +1110,15 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
> info.argsz = sizeof(info) + caps.size;
> info.cap_offset = 0;
> } else {
> - vfio_info_cap_shift(&caps, sizeof(info));
> + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info));
> + if (cap_offset < 0)
> + return cap_offset;
> +
> if (copy_to_user(arg + 1, caps.buf, caps.size)) {
> kfree(caps.buf);
> return -EFAULT;
> }
> - info.cap_offset = sizeof(*arg);
> + info.cap_offset = cap_offset;
> }
>
> kfree(caps.buf);
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index ebe0ad31d0b0..ab64b9e3ed7c 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -2808,14 +2808,17 @@ static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
> if (info.argsz < sizeof(info) + caps.size) {
> info.argsz = sizeof(info) + caps.size;
> } else {
> - vfio_info_cap_shift(&caps, sizeof(info));
> + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info));
> + if (cap_offset < 0)
> + return cap_offset;
> +
> if (copy_to_user((void __user *)arg +
> sizeof(info), caps.buf,
> caps.size)) {
> kfree(caps.buf);
> return -EFAULT;
> }
> - info.cap_offset = sizeof(info);
> + info.cap_offset = cap_offset;
> }
>
> kfree(caps.buf);
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> index f0ca33b2e1df..4fc8698577a7 100644
> --- a/drivers/vfio/vfio_main.c
> +++ b/drivers/vfio/vfio_main.c
> @@ -1171,8 +1171,18 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
> {
> void *buf;
> struct vfio_info_cap_header *header, *tmp;
> + size_t header_offset;
> + size_t new_size;
>
> - buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
> + /*
> + * Reserve extra space when the previous capability was not a multiple of
> + * the largest field size. This ensures that capabilities are properly
> + * aligned.
> + */
If we simply start with:
size = ALIGN(size, sizeof(u64));
then shouldn't there never be a previous misaligned size to correct?
I wonder if we really need all this complexity, we're drawing from a
finite set of info structs for the initial alignment, we can pad those
without breaking the uapi and we can introduce a warning to avoid such
poor alignment in the future. Allocating an aligned size for each
capability is then sufficiently trivial to handle runtime. ex:
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 902f06e52c48..2d074cbd371d 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -1362,6 +1362,8 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
void *buf;
struct vfio_info_cap_header *header, *tmp;
+ size = ALIGN(size, sizeof(u64));
+
buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
if (!buf) {
kfree(caps->buf);
@@ -1395,6 +1397,8 @@ void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
struct vfio_info_cap_header *tmp;
void *buf = (void *)caps->buf;
+ WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));
+
for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
tmp->next += offset;
}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index fa06e3eb4955..fd2761841ffe 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -217,6 +217,7 @@ struct vfio_device_info {
__u32 num_regions; /* Max region index + 1 */
__u32 num_irqs; /* Max IRQ index + 1 */
__u32 cap_offset; /* Offset within info struct of first cap */
+ __u32 pad; /* Size must be aligned for caps */
};
#define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7)
@@ -1444,6 +1445,7 @@ struct vfio_iommu_type1_info {
#define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */
__u64 iova_pgsizes; /* Bitmap of supported page sizes */
__u32 cap_offset; /* Offset within info struct of first cap */
+ __u32 pad; /* Size must be aligned for caps */
};
/*
Thanks,
Alex
> + header_offset = ALIGN(caps->size, sizeof(u64));
> + new_size = header_offset + size;
> +
> + buf = krealloc(caps->buf, new_size, GFP_KERNEL);
> if (!buf) {
> kfree(caps->buf);
> caps->buf = NULL;
> @@ -1181,10 +1191,10 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
> }
>
> caps->buf = buf;
> - header = buf + caps->size;
> + header = buf + header_offset;
>
> /* Eventually copied to user buffer, zero */
> - memset(header, 0, size);
> + memset(buf + caps->size, 0, new_size - caps->size);
>
> header->id = id;
> header->version = version;
> @@ -1193,20 +1203,47 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
> for (tmp = buf; tmp->next; tmp = buf + tmp->next)
> ; /* nothing */
>
> - tmp->next = caps->size;
> - caps->size += size;
> + tmp->next = header_offset;
> + caps->size = new_size;
>
> return header;
> }
> EXPORT_SYMBOL_GPL(vfio_info_cap_add);
>
> -void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
> +/*
> + * Adjust the capability next fields to account for the given offset at which
> + * capability structures start and any padding added for alignment. Returns the
> + * cap_offset or -errno.
> + */
> +ssize_t vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
> {
> struct vfio_info_cap_header *tmp;
> + struct vfio_info_cap_header *next_tmp;
> void *buf = (void *)caps->buf;
> + size_t pad = ALIGN(offset, sizeof(u64)) - offset;
> + size_t cap_offset = offset + pad;
>
> - for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
> - tmp->next += offset;
> + /* Shift the next fields to account for offset and pad */
> + for (tmp = buf; tmp->next; tmp = next_tmp) {
> + next_tmp = buf + tmp->next;
> + tmp->next += cap_offset;
> + }
> +
> + /* Pad with zeroes so capabilities start with proper alignment */
> + buf = krealloc(caps->buf, caps->size + pad, GFP_KERNEL);
> + if (!buf) {
> + kfree(caps->buf);
> + caps->buf = NULL;
> + caps->size = 0;
> + return -ENOMEM;
> + }
> +
> + memmove(buf + pad, buf, caps->size);
> + memset(buf, 0, pad);
> +
> + caps->buf = buf;
> + caps->size += pad;
> + return cap_offset;
> }
> EXPORT_SYMBOL(vfio_info_cap_shift);
>
On Thu, Aug 03, 2023 at 03:18:23PM -0600, Alex Williamson wrote:
> On Thu, 3 Aug 2023 10:41:09 -0400
> Stefan Hajnoczi <[email protected]> wrote:
>
> > The VFIO_DEVICE_GET_INFO, VFIO_DEVICE_GET_REGION_INFO, and
> > VFIO_IOMMU_GET_INFO ioctls fill in an info struct followed by capability
> > structs:
> >
> > +------+---------+---------+-----+
> > | info | caps[0] | caps[1] | ... |
> > +------+---------+---------+-----+
> >
> > Both the info and capability struct sizes are not always multiples of
> > sizeof(u64), leaving u64 fields in later capability structs misaligned.
> >
> > Userspace applications currently need to handle misalignment manually in
> > order to support CPU architectures and programming languages with strict
> > alignment requirements.
> >
> > Make life easier for userspace by ensuring alignment in the kernel.
> > The new layout is as follows:
> >
> > +------+---+---------+---------+---+-----+
> > | info | 0 | caps[0] | caps[1] | 0 | ... |
> > +------+---+---------+---------+---+-----+
> >
> > In this example info and caps[1] have sizes that are not multiples of
> > sizeof(u64), so zero padding is added to align the subsequent structure.
> >
> > Adding zero padding between structs does not break the uapi. The memory
> > layout is specified by the info.cap_offset and caps[i].next fields
> > filled in by the kernel. Applications use these field values to locate
> > structs and are therefore unaffected by the addition of zero padding.
> >
> > Signed-off-by: Stefan Hajnoczi <[email protected]>
> > ---
> > include/linux/vfio.h | 2 +-
> > drivers/gpu/drm/i915/gvt/kvmgt.c | 7 +++--
> > drivers/s390/cio/vfio_ccw_ops.c | 7 +++--
> > drivers/vfio/pci/vfio_pci_core.c | 14 ++++++---
> > drivers/vfio/vfio_iommu_type1.c | 7 +++--
> > drivers/vfio/vfio_main.c | 53 +++++++++++++++++++++++++++-----
> > 6 files changed, 71 insertions(+), 19 deletions(-)
> >
> > diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> > index 2c137ea94a3e..ff0864e73cc3 100644
> > --- a/include/linux/vfio.h
> > +++ b/include/linux/vfio.h
> > @@ -272,7 +272,7 @@ struct vfio_info_cap {
> > struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
> > size_t size, u16 id,
> > u16 version);
> > -void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset);
> > +ssize_t vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset);
> >
> > int vfio_info_add_capability(struct vfio_info_cap *caps,
> > struct vfio_info_cap_header *cap, size_t size);
> > diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
> > index de675d799c7d..9060e9c6ac7c 100644
> > --- a/drivers/gpu/drm/i915/gvt/kvmgt.c
> > +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
> > @@ -1297,7 +1297,10 @@ static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd,
> > info.argsz = sizeof(info) + caps.size;
> > info.cap_offset = 0;
> > } else {
> > - vfio_info_cap_shift(&caps, sizeof(info));
> > + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info));
> > + if (cap_offset < 0)
> > + return cap_offset;
> > +
> > if (copy_to_user((void __user *)arg +
> > sizeof(info), caps.buf,
> > caps.size)) {
> > @@ -1305,7 +1308,7 @@ static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd,
> > kfree(sparse);
> > return -EFAULT;
> > }
> > - info.cap_offset = sizeof(info);
> > + info.cap_offset = cap_offset;
>
> The copy_to_user() above needs to be modified to make this true:
>
> copy_to_user((void __user *)arg + cap_offset,...
>
> Same for all similar below.
vfio_info_cap_shift() inserts zero padding before the first capability
in order to achieve alignment. The zero padding is part of caps.buf, not
the info struct. Therefore the copy_to_user((void __user *)arg +
sizeof(info), ...) is correct.
It's confusing though. Your suggestion for simplifying alignment below
looks good and will avoid this issue.
>
> > }
> >
> > kfree(caps.buf);
> > diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
> > index 5b53b94f13c7..63d5163376a5 100644
> > --- a/drivers/s390/cio/vfio_ccw_ops.c
> > +++ b/drivers/s390/cio/vfio_ccw_ops.c
> > @@ -361,13 +361,16 @@ static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private,
> > info->argsz = sizeof(*info) + caps.size;
> > info->cap_offset = 0;
> > } else {
> > - vfio_info_cap_shift(&caps, sizeof(*info));
> > + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(*info));
> > + if (cap_offset < 0)
> > + return cap_offset;
> > +
> > if (copy_to_user((void __user *)arg + sizeof(*info),
> > caps.buf, caps.size)) {
> > kfree(caps.buf);
> > return -EFAULT;
> > }
> > - info->cap_offset = sizeof(*info);
> > + info->cap_offset = cap_offset;
> > }
> >
> > kfree(caps.buf);
> > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> > index 20d7b69ea6ff..92c093b99187 100644
> > --- a/drivers/vfio/pci/vfio_pci_core.c
> > +++ b/drivers/vfio/pci/vfio_pci_core.c
> > @@ -966,12 +966,15 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev,
> > if (info.argsz < sizeof(info) + caps.size) {
> > info.argsz = sizeof(info) + caps.size;
> > } else {
> > - vfio_info_cap_shift(&caps, sizeof(info));
> > + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info));
> > + if (cap_offset < 0)
> > + return cap_offset;
> > +
> > if (copy_to_user(arg + 1, caps.buf, caps.size)) {
> > kfree(caps.buf);
> > return -EFAULT;
> > }
> > - info.cap_offset = sizeof(*arg);
> > + info.cap_offset = cap_offset;
> > }
> >
> > kfree(caps.buf);
> > @@ -1107,12 +1110,15 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
> > info.argsz = sizeof(info) + caps.size;
> > info.cap_offset = 0;
> > } else {
> > - vfio_info_cap_shift(&caps, sizeof(info));
> > + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info));
> > + if (cap_offset < 0)
> > + return cap_offset;
> > +
> > if (copy_to_user(arg + 1, caps.buf, caps.size)) {
> > kfree(caps.buf);
> > return -EFAULT;
> > }
> > - info.cap_offset = sizeof(*arg);
> > + info.cap_offset = cap_offset;
> > }
> >
> > kfree(caps.buf);
> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > index ebe0ad31d0b0..ab64b9e3ed7c 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -2808,14 +2808,17 @@ static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
> > if (info.argsz < sizeof(info) + caps.size) {
> > info.argsz = sizeof(info) + caps.size;
> > } else {
> > - vfio_info_cap_shift(&caps, sizeof(info));
> > + ssize_t cap_offset = vfio_info_cap_shift(&caps, sizeof(info));
> > + if (cap_offset < 0)
> > + return cap_offset;
> > +
> > if (copy_to_user((void __user *)arg +
> > sizeof(info), caps.buf,
> > caps.size)) {
> > kfree(caps.buf);
> > return -EFAULT;
> > }
> > - info.cap_offset = sizeof(info);
> > + info.cap_offset = cap_offset;
> > }
> >
> > kfree(caps.buf);
> > diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> > index f0ca33b2e1df..4fc8698577a7 100644
> > --- a/drivers/vfio/vfio_main.c
> > +++ b/drivers/vfio/vfio_main.c
> > @@ -1171,8 +1171,18 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
> > {
> > void *buf;
> > struct vfio_info_cap_header *header, *tmp;
> > + size_t header_offset;
> > + size_t new_size;
> >
> > - buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
> > + /*
> > + * Reserve extra space when the previous capability was not a multiple of
> > + * the largest field size. This ensures that capabilities are properly
> > + * aligned.
> > + */
>
> If we simply start with:
>
> size = ALIGN(size, sizeof(u64));
>
> then shouldn't there never be a previous misaligned size to correct?
Yes, I applied padding at the beginning but it could be applied at the
end instead.
> I wonder if we really need all this complexity, we're drawing from a
> finite set of info structs for the initial alignment, we can pad those
> without breaking the uapi and we can introduce a warning to avoid such
> poor alignment in the future. Allocating an aligned size for each
> capability is then sufficiently trivial to handle runtime. ex:
>
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> index 902f06e52c48..2d074cbd371d 100644
> --- a/drivers/vfio/vfio_main.c
> +++ b/drivers/vfio/vfio_main.c
> @@ -1362,6 +1362,8 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
> void *buf;
> struct vfio_info_cap_header *header, *tmp;
>
> + size = ALIGN(size, sizeof(u64));
> +
> buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
> if (!buf) {
> kfree(caps->buf);
> @@ -1395,6 +1397,8 @@ void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
> struct vfio_info_cap_header *tmp;
> void *buf = (void *)caps->buf;
>
> + WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));
> +
> for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
> tmp->next += offset;
> }
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index fa06e3eb4955..fd2761841ffe 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -217,6 +217,7 @@ struct vfio_device_info {
> __u32 num_regions; /* Max region index + 1 */
> __u32 num_irqs; /* Max IRQ index + 1 */
> __u32 cap_offset; /* Offset within info struct of first cap */
> + __u32 pad; /* Size must be aligned for caps */
> };
> #define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7)
>
> @@ -1444,6 +1445,7 @@ struct vfio_iommu_type1_info {
> #define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */
> __u64 iova_pgsizes; /* Bitmap of supported page sizes */
> __u32 cap_offset; /* Offset within info struct of first cap */
> + __u32 pad; /* Size must be aligned for caps */
> };
>
> /*
Yes, that's much simpler. I'll send another revision that follows that
approach.
Stefan
On Thu, Aug 03, 2023 at 03:18:23PM -0600, Alex Williamson wrote:
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> index 902f06e52c48..2d074cbd371d 100644
> --- a/drivers/vfio/vfio_main.c
> +++ b/drivers/vfio/vfio_main.c
> @@ -1362,6 +1362,8 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
> void *buf;
> struct vfio_info_cap_header *header, *tmp;
>
> + size = ALIGN(size, sizeof(u64));
> +
> buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
> if (!buf) {
> kfree(caps->buf);
> @@ -1395,6 +1397,8 @@ void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
> struct vfio_info_cap_header *tmp;
> void *buf = (void *)caps->buf;
>
> + WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));
> +
> for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
> tmp->next += offset;
> }
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index fa06e3eb4955..fd2761841ffe 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -217,6 +217,7 @@ struct vfio_device_info {
> __u32 num_regions; /* Max region index + 1 */
> __u32 num_irqs; /* Max IRQ index + 1 */
> __u32 cap_offset; /* Offset within info struct of first cap */
> + __u32 pad; /* Size must be aligned for caps */
> };
> #define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7)
>
> @@ -1444,6 +1445,7 @@ struct vfio_iommu_type1_info {
> #define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */
> __u64 iova_pgsizes; /* Bitmap of supported page sizes */
> __u32 cap_offset; /* Offset within info struct of first cap */
> + __u32 pad; /* Size must be aligned for caps */
> };
IMHO this is partially being caused by not using __aligned_u64 for the
other __u64's in the same struct..
Both of these structs have u64s in them and many arches will
automatically add the above padding. __aligned_u64 will force the
reset to do it, and then making padding explicit as you have done will
make it really true.
This is a subtle x64/x32 compatability issue also. It is probably best
just to do the change across the whole header file.
Please also include the matching hunk for iommufd:
--- a/drivers/iommu/iommufd/vfio_compat.c
+++ b/drivers/iommu/iommufd/vfio_compat.c
@@ -483,6 +483,8 @@ static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx,
rc = cap_size;
goto out_put;
}
+ cap_size = ALIGN(cap_size, sizeof(u64));
+
if (last_cap && info.argsz >= total_cap_size &&
put_user(total_cap_size, &last_cap->next)) {
rc = -EFAULT;
Thanks,
Jason
On Tue, Aug 08, 2023 at 08:29:25PM -0300, Jason Gunthorpe wrote:
> On Thu, Aug 03, 2023 at 03:18:23PM -0600, Alex Williamson wrote:
>
> > diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> > index 902f06e52c48..2d074cbd371d 100644
> > --- a/drivers/vfio/vfio_main.c
> > +++ b/drivers/vfio/vfio_main.c
> > @@ -1362,6 +1362,8 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
> > void *buf;
> > struct vfio_info_cap_header *header, *tmp;
> >
> > + size = ALIGN(size, sizeof(u64));
> > +
> > buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
> > if (!buf) {
> > kfree(caps->buf);
> > @@ -1395,6 +1397,8 @@ void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
> > struct vfio_info_cap_header *tmp;
> > void *buf = (void *)caps->buf;
> >
> > + WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));
> > +
> > for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
> > tmp->next += offset;
> > }
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index fa06e3eb4955..fd2761841ffe 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -217,6 +217,7 @@ struct vfio_device_info {
> > __u32 num_regions; /* Max region index + 1 */
> > __u32 num_irqs; /* Max IRQ index + 1 */
> > __u32 cap_offset; /* Offset within info struct of first cap */
> > + __u32 pad; /* Size must be aligned for caps */
> > };
> > #define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7)
> >
> > @@ -1444,6 +1445,7 @@ struct vfio_iommu_type1_info {
> > #define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */
> > __u64 iova_pgsizes; /* Bitmap of supported page sizes */
> > __u32 cap_offset; /* Offset within info struct of first cap */
> > + __u32 pad; /* Size must be aligned for caps */
> > };
>
> IMHO this is partially being caused by not using __aligned_u64 for the
> other __u64's in the same struct..
>
> Both of these structs have u64s in them and many arches will
> automatically add the above padding. __aligned_u64 will force the
> reset to do it, and then making padding explicit as you have done will
> make it really true.
>
> This is a subtle x64/x32 compatability issue also. It is probably best
> just to do the change across the whole header file.
I will send a separate series that switches the struct definitions to
__aligned_u64.
> Please also include the matching hunk for iommufd:
>
> --- a/drivers/iommu/iommufd/vfio_compat.c
> +++ b/drivers/iommu/iommufd/vfio_compat.c
> @@ -483,6 +483,8 @@ static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx,
> rc = cap_size;
> goto out_put;
> }
> + cap_size = ALIGN(cap_size, sizeof(u64));
> +
> if (last_cap && info.argsz >= total_cap_size &&
> put_user(total_cap_size, &last_cap->next)) {
> rc = -EFAULT;
Okay, will fix.
Stefan