Currently the size of hypercall buffers allocated via
/dev/xen/hypercall is limited to a default of 64 memory pages. For live
migration of guests this might be too small as the page dirty bitmask
needs to be sized according to the size of the guest. This means
migrating a 8GB sized guest is already exhausting the default buffer
size for the dirty bitmap.
There is no sensible way to set a sane limit, so just remove it
completely. The device node's usage is limited to root anyway, so there
is no additional DOS scenario added by allowing unlimited buffers.
While at it make the error path for the -ENOMEM case a little bit
cleaner by setting n_pages to the number of successfully allocated
pages instead of the target size.
Fixes: c51b3c639e01f2 ("xen: add new hypercall buffer mapping device")
Cc: <[email protected]> #4.18
Signed-off-by: Juergen Gross <[email protected]>
---
drivers/xen/privcmd-buf.c | 22 ++++------------------
1 file changed, 4 insertions(+), 18 deletions(-)
diff --git a/drivers/xen/privcmd-buf.c b/drivers/xen/privcmd-buf.c
index df1ed37c3269..de01a6d0059d 100644
--- a/drivers/xen/privcmd-buf.c
+++ b/drivers/xen/privcmd-buf.c
@@ -21,15 +21,9 @@
MODULE_LICENSE("GPL");
-static unsigned int limit = 64;
-module_param(limit, uint, 0644);
-MODULE_PARM_DESC(limit, "Maximum number of pages that may be allocated by "
- "the privcmd-buf device per open file");
-
struct privcmd_buf_private {
struct mutex lock;
struct list_head list;
- unsigned int allocated;
};
struct privcmd_buf_vma_private {
@@ -60,13 +54,10 @@ static void privcmd_buf_vmapriv_free(struct privcmd_buf_vma_private *vma_priv)
{
unsigned int i;
- vma_priv->file_priv->allocated -= vma_priv->n_pages;
-
list_del(&vma_priv->list);
for (i = 0; i < vma_priv->n_pages; i++)
- if (vma_priv->pages[i])
- __free_page(vma_priv->pages[i]);
+ __free_page(vma_priv->pages[i]);
kfree(vma_priv);
}
@@ -146,8 +137,7 @@ static int privcmd_buf_mmap(struct file *file, struct vm_area_struct *vma)
unsigned int i;
int ret = 0;
- if (!(vma->vm_flags & VM_SHARED) || count > limit ||
- file_priv->allocated + count > limit)
+ if (!(vma->vm_flags & VM_SHARED))
return -EINVAL;
vma_priv = kzalloc(sizeof(*vma_priv) + count * sizeof(void *),
@@ -155,19 +145,15 @@ static int privcmd_buf_mmap(struct file *file, struct vm_area_struct *vma)
if (!vma_priv)
return -ENOMEM;
- vma_priv->n_pages = count;
- count = 0;
- for (i = 0; i < vma_priv->n_pages; i++) {
+ for (i = 0; i < count; i++) {
vma_priv->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!vma_priv->pages[i])
break;
- count++;
+ vma_priv->n_pages++;
}
mutex_lock(&file_priv->lock);
- file_priv->allocated += count;
-
vma_priv->file_priv = file_priv;
vma_priv->users = 1;
--
2.16.4
>>> Juergen Gross <[email protected]> 11/01/18 1:34 PM >>>
>Currently the size of hypercall buffers allocated via
>/dev/xen/hypercall is limited to a default of 64 memory pages. For live
>migration of guests this might be too small as the page dirty bitmask
>needs to be sized according to the size of the guest. This means
>migrating a 8GB sized guest is already exhausting the default buffer
>size for the dirty bitmap.
>
>There is no sensible way to set a sane limit, so just remove it
>completely. The device node's usage is limited to root anyway, so there
>is no additional DOS scenario added by allowing unlimited buffers.
But is this setting of permissions what we want long term? What about a
de-privileged qemu, which still needs to be able to issue at least dm-op
hypercalls?
Jan
On 01/11/2018 15:18, Jan Beulich wrote:
>>>> Juergen Gross <[email protected]> 11/01/18 1:34 PM >>>
>> Currently the size of hypercall buffers allocated via
>> /dev/xen/hypercall is limited to a default of 64 memory pages. For live
>> migration of guests this might be too small as the page dirty bitmask
>> needs to be sized according to the size of the guest. This means
>> migrating a 8GB sized guest is already exhausting the default buffer
>> size for the dirty bitmap.
>>
>> There is no sensible way to set a sane limit, so just remove it
>> completely. The device node's usage is limited to root anyway, so there
>> is no additional DOS scenario added by allowing unlimited buffers.
>
> But is this setting of permissions what we want long term? What about a
> de-privileged qemu, which still needs to be able to issue at least dm-op
> hypercalls?
Wouldn't that qemu have opened the node while still being privileged?
Juergen
On 01/11/18 14:18, Jan Beulich wrote:
>>>> Juergen Gross <[email protected]> 11/01/18 1:34 PM >>>
>> Currently the size of hypercall buffers allocated via
>> /dev/xen/hypercall is limited to a default of 64 memory pages. For live
>> migration of guests this might be too small as the page dirty bitmask
>> needs to be sized according to the size of the guest. This means
>> migrating a 8GB sized guest is already exhausting the default buffer
>> size for the dirty bitmap.
>>
>> There is no sensible way to set a sane limit, so just remove it
>> completely. The device node's usage is limited to root anyway, so there
>> is no additional DOS scenario added by allowing unlimited buffers.
> But is this setting of permissions what we want long term? What about a
> de-privileged qemu, which still needs to be able to issue at least dm-op
> hypercalls?
dm-op very deliberately doesn't have hypercall bounce buffers like this.
The kernel is responsible for ensuring that the buffer list passed in
the ioctl is safe memory, whether this is bouncing the buffers itself,
or simply ensuring that the underlying userspace memory won't disappear
across the hypercall.
~Andrew
>>> Juergen Gross <[email protected]> 11/01/18 3:23 PM >>>
>On 01/11/2018 15:18, Jan Beulich wrote:
>>>>> Juergen Gross <[email protected]> 11/01/18 1:34 PM >>>
>>> Currently the size of hypercall buffers allocated via
>>> /dev/xen/hypercall is limited to a default of 64 memory pages. For live
>>> migration of guests this might be too small as the page dirty bitmask
>>> needs to be sized according to the size of the guest. This means
>>> migrating a 8GB sized guest is already exhausting the default buffer
>>> size for the dirty bitmap.
>>>
>>> There is no sensible way to set a sane limit, so just remove it
>>> completely. The device node's usage is limited to root anyway, so there
>>> is no additional DOS scenario added by allowing unlimited buffers.
>>
>> But is this setting of permissions what we want long term? What about a
>> de-privileged qemu, which still needs to be able to issue at least dm-op
>> hypercalls?
>
>Wouldn't that qemu have opened the node while still being privileged?
Possibly, but how does this help? As soon as it's unprivileged it must not
be able to hog resources anymore.
Anyway, with Andrew's reply my point may be irrelevant, but I have to
admit I'm not entirely sure.
Jan
Ping?
Jan's remark regarding de-privileged qemu is no issue as the hypercall
node is being closed by the de-privilege library function.
Juergen
On 01/11/2018 13:33, Juergen Gross wrote:
> Currently the size of hypercall buffers allocated via
> /dev/xen/hypercall is limited to a default of 64 memory pages. For live
> migration of guests this might be too small as the page dirty bitmask
> needs to be sized according to the size of the guest. This means
> migrating a 8GB sized guest is already exhausting the default buffer
> size for the dirty bitmap.
>
> There is no sensible way to set a sane limit, so just remove it
> completely. The device node's usage is limited to root anyway, so there
> is no additional DOS scenario added by allowing unlimited buffers.
>
> While at it make the error path for the -ENOMEM case a little bit
> cleaner by setting n_pages to the number of successfully allocated
> pages instead of the target size.
>
> Fixes: c51b3c639e01f2 ("xen: add new hypercall buffer mapping device")
> Cc: <[email protected]> #4.18
> Signed-off-by: Juergen Gross <[email protected]>
> ---
> drivers/xen/privcmd-buf.c | 22 ++++------------------
> 1 file changed, 4 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/xen/privcmd-buf.c b/drivers/xen/privcmd-buf.c
> index df1ed37c3269..de01a6d0059d 100644
> --- a/drivers/xen/privcmd-buf.c
> +++ b/drivers/xen/privcmd-buf.c
> @@ -21,15 +21,9 @@
>
> MODULE_LICENSE("GPL");
>
> -static unsigned int limit = 64;
> -module_param(limit, uint, 0644);
> -MODULE_PARM_DESC(limit, "Maximum number of pages that may be allocated by "
> - "the privcmd-buf device per open file");
> -
> struct privcmd_buf_private {
> struct mutex lock;
> struct list_head list;
> - unsigned int allocated;
> };
>
> struct privcmd_buf_vma_private {
> @@ -60,13 +54,10 @@ static void privcmd_buf_vmapriv_free(struct privcmd_buf_vma_private *vma_priv)
> {
> unsigned int i;
>
> - vma_priv->file_priv->allocated -= vma_priv->n_pages;
> -
> list_del(&vma_priv->list);
>
> for (i = 0; i < vma_priv->n_pages; i++)
> - if (vma_priv->pages[i])
> - __free_page(vma_priv->pages[i]);
> + __free_page(vma_priv->pages[i]);
>
> kfree(vma_priv);
> }
> @@ -146,8 +137,7 @@ static int privcmd_buf_mmap(struct file *file, struct vm_area_struct *vma)
> unsigned int i;
> int ret = 0;
>
> - if (!(vma->vm_flags & VM_SHARED) || count > limit ||
> - file_priv->allocated + count > limit)
> + if (!(vma->vm_flags & VM_SHARED))
> return -EINVAL;
>
> vma_priv = kzalloc(sizeof(*vma_priv) + count * sizeof(void *),
> @@ -155,19 +145,15 @@ static int privcmd_buf_mmap(struct file *file, struct vm_area_struct *vma)
> if (!vma_priv)
> return -ENOMEM;
>
> - vma_priv->n_pages = count;
> - count = 0;
> - for (i = 0; i < vma_priv->n_pages; i++) {
> + for (i = 0; i < count; i++) {
> vma_priv->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
> if (!vma_priv->pages[i])
> break;
> - count++;
> + vma_priv->n_pages++;
> }
>
> mutex_lock(&file_priv->lock);
>
> - file_priv->allocated += count;
> -
> vma_priv->file_priv = file_priv;
> vma_priv->users = 1;
>
>
On 11/9/18 2:03 AM, Juergen Gross wrote:
> Ping?
>
> Jan's remark regarding de-privileged qemu is no issue as the hypercall
> node is being closed by the de-privilege library function.
Reviewed-by: Boris Ostrovsky <[email protected]>