2012-10-19 07:37:45

by Xiao Guangrong

Subject: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

After commit b3356bf0dbb349 (KVM: emulator: optimize "rep ins" handling),
the pieces of I/O data can be collected and written to guest memory or
MMIO together.

Unfortunately, kvm splits the MMIO access into 8-byte pieces and stores
them in vcpu->mmio_fragments. If the guest uses "rep ins" to move large
data, it will overflow vcpu->mmio_fragments.

The bug can be exposed by isapc (-M isapc):

[23154.818733] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC
[ ......]
[23154.858083] Call Trace:
[23154.859874] [<ffffffffa04f0e17>] kvm_get_cr8+0x1d/0x28 [kvm]
[23154.861677] [<ffffffffa04fa6d4>] kvm_arch_vcpu_ioctl_run+0xcda/0xe45 [kvm]
[23154.863604] [<ffffffffa04f5a1a>] ? kvm_arch_vcpu_load+0x17b/0x180 [kvm]


Actually, we can use a single mmio_fragment to store a large MMIO access,
since the access is always contiguous, and split it only when we pass the
mmio exit info to userspace. After that, we need only two entries to store
mmio info for an access that crosses MMIO pages.
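
As an illustration (a standalone sketch, not kernel code: the harness is
hypothetical, but the splitting arithmetic mirrors get_current_mmio_info
below), a 1024-byte "rep ins" into MMIO now stays a single fragment that
is replayed to userspace as 128 8-byte exits instead of 128 fragments:

#include <stdio.h>

struct mmio_fragment { unsigned long gpa; unsigned len; unsigned pos; };

int main(void)
{
	/* One 1024-byte MMIO store is kept as a single fragment. */
	struct mmio_fragment frag = { .gpa = 0xa0000, .len = 1024, .pos = 0 };
	int exits = 0;

	while (frag.pos < frag.len) {
		/* Each piece of up to 8 bytes is one KVM_EXIT_MMIO. */
		unsigned len = frag.len - frag.pos < 8 ? frag.len - frag.pos : 8;

		frag.pos += len;
		exits++;
	}

	printf("%u bytes replayed in %d exits\n", frag.len, exits);
	return 0;
}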

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/x86.c | 127 +++++++++++++++++++++++++++++-----------------
include/linux/kvm_host.h | 16 +-----
2 files changed, 84 insertions(+), 59 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8b90dd5..41ceb51 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3779,9 +3779,6 @@ static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
void *val, int bytes)
{
- struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
-
- memcpy(vcpu->run->mmio.data, frag->data, frag->len);
return X86EMUL_CONTINUE;
}

@@ -3799,6 +3796,64 @@ static const struct read_write_emulator_ops write_emultor = {
.write = true,
};

+static bool get_current_mmio_info(struct kvm_vcpu *vcpu, gpa_t *gpa,
+ unsigned *len, void **data)
+{
+ struct kvm_mmio_fragment *frag;
+ int cur = vcpu->mmio_cur_fragment;
+
+ if (cur >= vcpu->mmio_nr_fragments)
+ return false;
+
+ frag = &vcpu->mmio_fragments[cur];
+ if (frag->pos >= frag->len) {
+ if (++vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments)
+ return false;
+ frag++;
+ }
+
+ *gpa = frag->gpa + frag->pos;
+ *data = frag->data + frag->pos;
+ *len = min(8u, frag->len - frag->pos);
+ return true;
+}
+
+static void complete_current_mmio(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmio_fragment *frag;
+ gpa_t gpa;
+ unsigned len;
+ void *data;
+
+ get_current_mmio_info(vcpu, &gpa, &len, &data);
+
+ if (!vcpu->mmio_is_write)
+ memcpy(data, vcpu->run->mmio.data, len);
+
+ /* Increase frag->pos to switch to the next mmio. */
+ frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
+ frag->pos += len;
+}
+
+static bool vcpu_fill_mmio_exit_info(struct kvm_vcpu *vcpu)
+{
+ gpa_t gpa;
+ unsigned len;
+ void *data;
+
+ if (!get_current_mmio_info(vcpu, &gpa, &len, &data))
+ return false;
+
+ vcpu->run->mmio.len = len;
+ vcpu->run->mmio.is_write = vcpu->mmio_is_write;
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+ vcpu->run->mmio.phys_addr = gpa;
+
+ if (vcpu->mmio_is_write)
+ memcpy(vcpu->run->mmio.data, data, len);
+ return true;
+}
+
static int emulator_read_write_onepage(unsigned long addr, void *val,
unsigned int bytes,
struct x86_exception *exception,
@@ -3834,18 +3889,12 @@ mmio:
bytes -= handled;
val += handled;

- while (bytes) {
- unsigned now = min(bytes, 8U);
-
- frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
- frag->gpa = gpa;
- frag->data = val;
- frag->len = now;
-
- gpa += now;
- val += now;
- bytes -= now;
- }
+ WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
+ frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
+ frag->pos = 0;
+ frag->gpa = gpa;
+ frag->data = val;
+ frag->len = bytes;
return X86EMUL_CONTINUE;
}

@@ -3855,7 +3904,6 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
const struct read_write_emulator_ops *ops)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- gpa_t gpa;
int rc;

if (ops->read_write_prepare &&
@@ -3887,17 +3935,13 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
if (!vcpu->mmio_nr_fragments)
return rc;

- gpa = vcpu->mmio_fragments[0].gpa;
-
vcpu->mmio_needed = 1;
vcpu->mmio_cur_fragment = 0;
+ vcpu->mmio_is_write = ops->write;

- vcpu->run->mmio.len = vcpu->mmio_fragments[0].len;
- vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
- vcpu->run->exit_reason = KVM_EXIT_MMIO;
- vcpu->run->mmio.phys_addr = gpa;
-
- return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
+ vcpu_fill_mmio_exit_info(vcpu);
+ return ops->read_write_exit_mmio(vcpu, vcpu->mmio_fragments[0].gpa,
+ val, bytes);
}

static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
@@ -5524,43 +5568,34 @@ static int complete_emulated_pio(struct kvm_vcpu *vcpu)
*
* read:
* for each fragment
- * write gpa, len
- * exit
- * copy data
+ * for each mmio piece in the fragment
+ * write gpa, len
+ * exit
+ * copy data
* execute insn
*
* write:
* for each fragment
- * write gpa, len
- * copy data
- * exit
+ * for each mmio piece in the fragment
+ * write gpa, len
+ * copy data
+ * exit
*/
static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
{
- struct kvm_run *run = vcpu->run;
- struct kvm_mmio_fragment *frag;
-
BUG_ON(!vcpu->mmio_needed);

- /* Complete previous fragment */
- frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
- if (!vcpu->mmio_is_write)
- memcpy(frag->data, run->mmio.data, frag->len);
- if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
+ complete_current_mmio(vcpu);
+
+ /* Initiate next fragment */
+ if (!vcpu_fill_mmio_exit_info(vcpu)) {
vcpu->mmio_needed = 0;
if (vcpu->mmio_is_write)
return 1;
vcpu->mmio_read_completed = 1;
return complete_emulated_io(vcpu);
}
- /* Initiate next fragment */
- ++frag;
- run->exit_reason = KVM_EXIT_MMIO;
- run->mmio.phys_addr = frag->gpa;
- if (vcpu->mmio_is_write)
- memcpy(run->mmio.data, frag->data, frag->len);
- run->mmio.len = frag->len;
- run->mmio.is_write = vcpu->mmio_is_write;
+
vcpu->arch.complete_userspace_io = complete_emulated_mmio;
return 0;
}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e235068..f6c3c2f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -42,19 +42,8 @@
*/
#define KVM_MEMSLOT_INVALID (1UL << 16)

-/*
- * If we support unaligned MMIO, at most one fragment will be split into two:
- */
-#ifdef KVM_UNALIGNED_MMIO
-# define KVM_EXTRA_MMIO_FRAGMENTS 1
-#else
-# define KVM_EXTRA_MMIO_FRAGMENTS 0
-#endif
-
-#define KVM_USER_MMIO_SIZE 8
-
-#define KVM_MAX_MMIO_FRAGMENTS \
- (KVM_MMIO_SIZE / KVM_USER_MMIO_SIZE + KVM_EXTRA_MMIO_FRAGMENTS)
+/* Two fragments for cross MMIO pages. */
+#define KVM_MAX_MMIO_FRAGMENTS 2

/*
* For the normal pfn, the highest 12 bits should be zero,
@@ -203,6 +192,7 @@ struct kvm_mmio_fragment {
gpa_t gpa;
void *data;
unsigned len;
+ unsigned pos;
};

struct kvm_vcpu {
--
1.7.7.6


2012-10-19 07:39:17

by Xiao Guangrong

Subject: [PATCH] emulator test: add "rep ins" mmio access test

Add a test that triggers the bug where "rep ins" causes vcpu->mmio_fragments
to overflow while moving large data from an ioport to MMIO.

Signed-off-by: Xiao Guangrong <[email protected]>
---
x86/emulator.c | 14 ++++++++++++++
1 files changed, 14 insertions(+), 0 deletions(-)

diff --git a/x86/emulator.c b/x86/emulator.c
index 24b33d1..0735405 100644
--- a/x86/emulator.c
+++ b/x86/emulator.c
@@ -731,6 +731,18 @@ static void test_crosspage_mmio(volatile uint8_t *mem)
report("cross-page mmio write", mem[4095] == 0xaa && mem[4096] == 0x88);
}

+static void test_string_io_mmio(volatile uint8_t *mem)
+{
+ /* Cross MMIO pages.*/
+ volatile uint8_t *mmio = mem + 4032;
+
+ asm volatile("outw %%ax, %%dx \n\t" : : "a"(0x9999), "d"(TESTDEV_IO_PORT));
+
+ asm volatile ("cld; rep insb" : : "d" (TESTDEV_IO_PORT), "D" (mmio), "c" (1024));
+
+ report("string_io_mmio", mmio[1023] == 0x99);
+}
+
static void test_lgdt_lidt(volatile uint8_t *mem)
{
struct descriptor_table_ptr orig, fresh = {};
@@ -878,6 +890,8 @@ int main()

test_crosspage_mmio(mem);

+ test_string_io_mmio(mem);
+
printf("\nSUMMARY: %d tests, %d failures\n", tests, fails);
return fails ? 1 : 0;
}
--
1.7.7.6

2012-10-22 09:16:23

by Gleb Natapov

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On Fri, Oct 19, 2012 at 03:37:32PM +0800, Xiao Guangrong wrote:
> After commit b3356bf0dbb349 (KVM: emulator: optimize "rep ins" handling),
> the pieces of io data can be collected and write them to the guest memory
> or MMIO together.
>
> Unfortunately, kvm splits the mmio access into 8 bytes and store them to
> vcpu->mmio_fragments. If the guest uses "rep ins" to move large data, it
> will cause vcpu->mmio_fragments overflow
>
> The bug can be exposed by isapc (-M isapc):
>
> [23154.818733] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC
> [ ......]
> [23154.858083] Call Trace:
> [23154.859874] [<ffffffffa04f0e17>] kvm_get_cr8+0x1d/0x28 [kvm]
> [23154.861677] [<ffffffffa04fa6d4>] kvm_arch_vcpu_ioctl_run+0xcda/0xe45 [kvm]
> [23154.863604] [<ffffffffa04f5a1a>] ? kvm_arch_vcpu_load+0x17b/0x180 [kvm]
>
>
> Actually, we can use one mmio_fragment to store a large mmio access for the
> mmio access is always continuous then split it when we pass the mmio-exit-info
> to userspace. After that, we only need two entries to store mmio info for
> the cross-mmio pages access
>
I wonder, can we put the data into the coalesced mmio buffer instead of
exiting for each 8 bytes? Is it worth the complexity?

> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/x86.c | 127 +++++++++++++++++++++++++++++-----------------
> include/linux/kvm_host.h | 16 +-----
> 2 files changed, 84 insertions(+), 59 deletions(-)
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 8b90dd5..41ceb51 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3779,9 +3779,6 @@ static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
> static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
> void *val, int bytes)
> {
> - struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
> -
> - memcpy(vcpu->run->mmio.data, frag->data, frag->len);
> return X86EMUL_CONTINUE;
> }
>
> @@ -3799,6 +3796,64 @@ static const struct read_write_emulator_ops write_emultor = {
> .write = true,
> };
>
> +static bool get_current_mmio_info(struct kvm_vcpu *vcpu, gpa_t *gpa,
> + unsigned *len, void **data)
> +{
> + struct kvm_mmio_fragment *frag;
> + int cur = vcpu->mmio_cur_fragment;
> +
> + if (cur >= vcpu->mmio_nr_fragments)
> + return false;
> +
> + frag = &vcpu->mmio_fragments[cur];
> + if (frag->pos >= frag->len) {
> + if (++vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments)
> + return false;
> + frag++;
> + }
> +
> + *gpa = frag->gpa + frag->pos;
> + *data = frag->data + frag->pos;
> + *len = min(8u, frag->len - frag->pos);
> + return true;
> +}
> +
> +static void complete_current_mmio(struct kvm_vcpu *vcpu)
> +{
> + struct kvm_mmio_fragment *frag;
> + gpa_t gpa;
> + unsigned len;
> + void *data;
> +
> + get_current_mmio_info(vcpu, &gpa, &len, &data);
> +
> + if (!vcpu->mmio_is_write)
> + memcpy(data, vcpu->run->mmio.data, len);
> +
> + /* Increase frag->pos to switch to the next mmio. */
> + frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
> + frag->pos += len;
> +}
> +
> +static bool vcpu_fill_mmio_exit_info(struct kvm_vcpu *vcpu)
> +{
> + gpa_t gpa;
> + unsigned len;
> + void *data;
> +
> + if (!get_current_mmio_info(vcpu, &gpa, &len, &data))
> + return false;
> +
> + vcpu->run->mmio.len = len;
> + vcpu->run->mmio.is_write = vcpu->mmio_is_write;
> + vcpu->run->exit_reason = KVM_EXIT_MMIO;
> + vcpu->run->mmio.phys_addr = gpa;
> +
> + if (vcpu->mmio_is_write)
> + memcpy(vcpu->run->mmio.data, data, len);
> + return true;
> +}
> +
> static int emulator_read_write_onepage(unsigned long addr, void *val,
> unsigned int bytes,
> struct x86_exception *exception,
> @@ -3834,18 +3889,12 @@ mmio:
> bytes -= handled;
> val += handled;
>
> - while (bytes) {
> - unsigned now = min(bytes, 8U);
> -
> - frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
> - frag->gpa = gpa;
> - frag->data = val;
> - frag->len = now;
> -
> - gpa += now;
> - val += now;
> - bytes -= now;
> - }
> + WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
> + frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
> + frag->pos = 0;
> + frag->gpa = gpa;
> + frag->data = val;
> + frag->len = bytes;
> return X86EMUL_CONTINUE;
> }
>
> @@ -3855,7 +3904,6 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
> const struct read_write_emulator_ops *ops)
> {
> struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
> - gpa_t gpa;
> int rc;
>
> if (ops->read_write_prepare &&
> @@ -3887,17 +3935,13 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
> if (!vcpu->mmio_nr_fragments)
> return rc;
>
> - gpa = vcpu->mmio_fragments[0].gpa;
> -
> vcpu->mmio_needed = 1;
> vcpu->mmio_cur_fragment = 0;
> + vcpu->mmio_is_write = ops->write;
>
> - vcpu->run->mmio.len = vcpu->mmio_fragments[0].len;
> - vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
> - vcpu->run->exit_reason = KVM_EXIT_MMIO;
> - vcpu->run->mmio.phys_addr = gpa;
> -
> - return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
> + vcpu_fill_mmio_exit_info(vcpu);
> + return ops->read_write_exit_mmio(vcpu, vcpu->mmio_fragments[0].gpa,
> + val, bytes);
> }
>
> static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
> @@ -5524,43 +5568,34 @@ static int complete_emulated_pio(struct kvm_vcpu *vcpu)
> *
> * read:
> * for each fragment
> - * write gpa, len
> - * exit
> - * copy data
> + * for each mmio piece in the fragment
> + * write gpa, len
> + * exit
> + * copy data
> * execute insn
> *
> * write:
> * for each fragment
> - * write gpa, len
> - * copy data
> - * exit
> + * for each mmio piece in the fragment
> + * write gpa, len
> + * copy data
> + * exit
> */
> static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
> {
> - struct kvm_run *run = vcpu->run;
> - struct kvm_mmio_fragment *frag;
> -
> BUG_ON(!vcpu->mmio_needed);
>
> - /* Complete previous fragment */
> - frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
> - if (!vcpu->mmio_is_write)
> - memcpy(frag->data, run->mmio.data, frag->len);
> - if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
> + complete_current_mmio(vcpu);
> +
> + /* Initiate next fragment */
> + if (!vcpu_fill_mmio_exit_info(vcpu)) {
> vcpu->mmio_needed = 0;
> if (vcpu->mmio_is_write)
> return 1;
> vcpu->mmio_read_completed = 1;
> return complete_emulated_io(vcpu);
> }
> - /* Initiate next fragment */
> - ++frag;
> - run->exit_reason = KVM_EXIT_MMIO;
> - run->mmio.phys_addr = frag->gpa;
> - if (vcpu->mmio_is_write)
> - memcpy(run->mmio.data, frag->data, frag->len);
> - run->mmio.len = frag->len;
> - run->mmio.is_write = vcpu->mmio_is_write;
> +
> vcpu->arch.complete_userspace_io = complete_emulated_mmio;
> return 0;
> }
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index e235068..f6c3c2f 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -42,19 +42,8 @@
> */
> #define KVM_MEMSLOT_INVALID (1UL << 16)
>
> -/*
> - * If we support unaligned MMIO, at most one fragment will be split into two:
> - */
> -#ifdef KVM_UNALIGNED_MMIO
> -# define KVM_EXTRA_MMIO_FRAGMENTS 1
> -#else
> -# define KVM_EXTRA_MMIO_FRAGMENTS 0
> -#endif
> -
> -#define KVM_USER_MMIO_SIZE 8
> -
> -#define KVM_MAX_MMIO_FRAGMENTS \
> - (KVM_MMIO_SIZE / KVM_USER_MMIO_SIZE + KVM_EXTRA_MMIO_FRAGMENTS)
> +/* Two fragments for cross MMIO pages. */
> +#define KVM_MAX_MMIO_FRAGMENTS 2
>
> /*
> * For the normal pfn, the highest 12 bits should be zero,
> @@ -203,6 +192,7 @@ struct kvm_mmio_fragment {
> gpa_t gpa;
> void *data;
> unsigned len;
> + unsigned pos;
> };
>
> struct kvm_vcpu {
> --
> 1.7.7.6

--
Gleb.

2012-10-22 11:09:48

by Xiao Guangrong

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On 10/22/2012 05:16 PM, Gleb Natapov wrote:
> On Fri, Oct 19, 2012 at 03:37:32PM +0800, Xiao Guangrong wrote:
>> After commit b3356bf0dbb349 (KVM: emulator: optimize "rep ins" handling),
>> the pieces of io data can be collected and write them to the guest memory
>> or MMIO together.
>>
>> Unfortunately, kvm splits the mmio access into 8 bytes and store them to
>> vcpu->mmio_fragments. If the guest uses "rep ins" to move large data, it
>> will cause vcpu->mmio_fragments overflow
>>
>> The bug can be exposed by isapc (-M isapc):
>>
>> [23154.818733] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC
>> [ ......]
>> [23154.858083] Call Trace:
>> [23154.859874] [<ffffffffa04f0e17>] kvm_get_cr8+0x1d/0x28 [kvm]
>> [23154.861677] [<ffffffffa04fa6d4>] kvm_arch_vcpu_ioctl_run+0xcda/0xe45 [kvm]
>> [23154.863604] [<ffffffffa04f5a1a>] ? kvm_arch_vcpu_load+0x17b/0x180 [kvm]
>>
>>
>> Actually, we can use one mmio_fragment to store a large mmio access for the
>> mmio access is always continuous then split it when we pass the mmio-exit-info
>> to userspace. After that, we only need two entries to store mmio info for
>> the cross-mmio pages access
>>
> I wonder can we put the data into coalesced mmio buffer instead of

If we put all mmio data into the coalesced buffer, we should:
- ensure the userspace program uses KVM_REGISTER_COALESCED_MMIO to register
all mmio regions.

- even if an MMIO region is not used by an emulated device, it also needs
to be registered.

This would break old userspace programs.

> exiting for each 8 bytes? Is it worth the complexity?

A simpler way is always better, but I failed to find one, so I'd appreciate
your comments.

2012-10-22 11:23:18

by Gleb Natapov

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On Mon, Oct 22, 2012 at 07:09:38PM +0800, Xiao Guangrong wrote:
> On 10/22/2012 05:16 PM, Gleb Natapov wrote:
> > On Fri, Oct 19, 2012 at 03:37:32PM +0800, Xiao Guangrong wrote:
> >> After commit b3356bf0dbb349 (KVM: emulator: optimize "rep ins" handling),
> >> the pieces of io data can be collected and write them to the guest memory
> >> or MMIO together.
> >>
> >> Unfortunately, kvm splits the mmio access into 8 bytes and store them to
> >> vcpu->mmio_fragments. If the guest uses "rep ins" to move large data, it
> >> will cause vcpu->mmio_fragments overflow
> >>
> >> The bug can be exposed by isapc (-M isapc):
> >>
> >> [23154.818733] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC
> >> [ ......]
> >> [23154.858083] Call Trace:
> >> [23154.859874] [<ffffffffa04f0e17>] kvm_get_cr8+0x1d/0x28 [kvm]
> >> [23154.861677] [<ffffffffa04fa6d4>] kvm_arch_vcpu_ioctl_run+0xcda/0xe45 [kvm]
> >> [23154.863604] [<ffffffffa04f5a1a>] ? kvm_arch_vcpu_load+0x17b/0x180 [kvm]
> >>
> >>
> >> Actually, we can use one mmio_fragment to store a large mmio access for the
> >> mmio access is always continuous then split it when we pass the mmio-exit-info
> >> to userspace. After that, we only need two entries to store mmio info for
> >> the cross-mmio pages access
> >>
> > I wonder can we put the data into coalesced mmio buffer instead of
>
> If we put all mmio data into coalesced buffer, we should:
> - ensure the userspace program uses KVM_REGISTER_COALESCED_MMIO to register
> all mmio regions.
>
It appears not to be so. Userspace calls kvm_flush_coalesced_mmio_buffer()
after returning from KVM_RUN; the function looks like this:

void kvm_flush_coalesced_mmio_buffer(void)
{
KVMState *s = kvm_state;

if (s->coalesced_flush_in_progress) {
return;
}

s->coalesced_flush_in_progress = true;

if (s->coalesced_mmio_ring) {
struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
while (ring->first != ring->last) {
struct kvm_coalesced_mmio *ent;

ent = &ring->coalesced_mmio[ring->first];

cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
smp_wmb();
ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
}
}

s->coalesced_flush_in_progress = false;
}

Nowhere in this function do we check that the MMIO region was registered
with KVM_REGISTER_COALESCED_MMIO. We do not even check that the address is
MMIO.

> - even if the MMIO region is not used by emulated-device, it also need to be
> registered.
Same. I think writes to a non-registered region will be discarded.

>
> It will breaks old version userspace program.
>
> > exiting for each 8 bytes? Is it worth the complexity?
>
> Simpler way is always better but i failed, so i appreciate your guys comments.
>
Why have you failed? Exiting for each 8 bytes is infinitely better than a
buffer overflow. My question about complexity was about the theoretically
more complex code that would use the coalesced MMIO buffer.

--
Gleb.

2012-10-22 11:43:18

by Gleb Natapov

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On Mon, Oct 22, 2012 at 01:35:56PM +0200, Jan Kiszka wrote:
> On 2012-10-22 13:23, Gleb Natapov wrote:
> > On Mon, Oct 22, 2012 at 07:09:38PM +0800, Xiao Guangrong wrote:
> >> On 10/22/2012 05:16 PM, Gleb Natapov wrote:
> >>> On Fri, Oct 19, 2012 at 03:37:32PM +0800, Xiao Guangrong wrote:
> >>>> After commit b3356bf0dbb349 (KVM: emulator: optimize "rep ins" handling),
> >>>> the pieces of io data can be collected and write them to the guest memory
> >>>> or MMIO together.
> >>>>
> >>>> Unfortunately, kvm splits the mmio access into 8 bytes and store them to
> >>>> vcpu->mmio_fragments. If the guest uses "rep ins" to move large data, it
> >>>> will cause vcpu->mmio_fragments overflow
> >>>>
> >>>> The bug can be exposed by isapc (-M isapc):
> >>>>
> >>>> [23154.818733] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC
> >>>> [ ......]
> >>>> [23154.858083] Call Trace:
> >>>> [23154.859874] [<ffffffffa04f0e17>] kvm_get_cr8+0x1d/0x28 [kvm]
> >>>> [23154.861677] [<ffffffffa04fa6d4>] kvm_arch_vcpu_ioctl_run+0xcda/0xe45 [kvm]
> >>>> [23154.863604] [<ffffffffa04f5a1a>] ? kvm_arch_vcpu_load+0x17b/0x180 [kvm]
> >>>>
> >>>>
> >>>> Actually, we can use one mmio_fragment to store a large mmio access for the
> >>>> mmio access is always continuous then split it when we pass the mmio-exit-info
> >>>> to userspace. After that, we only need two entries to store mmio info for
> >>>> the cross-mmio pages access
> >>>>
> >>> I wonder can we put the data into coalesced mmio buffer instead of
> >>
> >> If we put all mmio data into coalesced buffer, we should:
> >> - ensure the userspace program uses KVM_REGISTER_COALESCED_MMIO to register
> >> all mmio regions.
> >>
> > It appears to not be so.
> > Userspace calls kvm_flush_coalesced_mmio_buffer() after returning from
> > KVM_RUN which looks like this:
>
> Nope, no longer, only on accesses to devices that actually use such
> regions (and there are only two ATM). The current design of a global
> coalesced mmio ring is horrible /wrt latency.
>
Indeed. After a git pull and recheck, the call to
kvm_flush_coalesced_mmio_buffer() is gone. So this will break new
userspace, not old. By global do you mean shared between devices (or
memory regions)?

--
Gleb.

2012-10-22 11:45:47

by Jan Kiszka

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On 2012-10-22 13:43, Gleb Natapov wrote:
> On Mon, Oct 22, 2012 at 01:35:56PM +0200, Jan Kiszka wrote:
>> On 2012-10-22 13:23, Gleb Natapov wrote:
>>> On Mon, Oct 22, 2012 at 07:09:38PM +0800, Xiao Guangrong wrote:
>>>> On 10/22/2012 05:16 PM, Gleb Natapov wrote:
>>>>> On Fri, Oct 19, 2012 at 03:37:32PM +0800, Xiao Guangrong wrote:
>>>>>> After commit b3356bf0dbb349 (KVM: emulator: optimize "rep ins" handling),
>>>>>> the pieces of io data can be collected and write them to the guest memory
>>>>>> or MMIO together.
>>>>>>
>>>>>> Unfortunately, kvm splits the mmio access into 8 bytes and store them to
>>>>>> vcpu->mmio_fragments. If the guest uses "rep ins" to move large data, it
>>>>>> will cause vcpu->mmio_fragments overflow
>>>>>>
>>>>>> The bug can be exposed by isapc (-M isapc):
>>>>>>
>>>>>> [23154.818733] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC
>>>>>> [ ......]
>>>>>> [23154.858083] Call Trace:
>>>>>> [23154.859874] [<ffffffffa04f0e17>] kvm_get_cr8+0x1d/0x28 [kvm]
>>>>>> [23154.861677] [<ffffffffa04fa6d4>] kvm_arch_vcpu_ioctl_run+0xcda/0xe45 [kvm]
>>>>>> [23154.863604] [<ffffffffa04f5a1a>] ? kvm_arch_vcpu_load+0x17b/0x180 [kvm]
>>>>>>
>>>>>>
>>>>>> Actually, we can use one mmio_fragment to store a large mmio access for the
>>>>>> mmio access is always continuous then split it when we pass the mmio-exit-info
>>>>>> to userspace. After that, we only need two entries to store mmio info for
>>>>>> the cross-mmio pages access
>>>>>>
>>>>> I wonder can we put the data into coalesced mmio buffer instead of
>>>>
>>>> If we put all mmio data into coalesced buffer, we should:
>>>> - ensure the userspace program uses KVM_REGISTER_COALESCED_MMIO to register
>>>> all mmio regions.
>>>>
>>> It appears to not be so.
>>> Userspace calls kvm_flush_coalesced_mmio_buffer() after returning from
>>> KVM_RUN which looks like this:
>>
>> Nope, no longer, only on accesses to devices that actually use such
>> regions (and there are only two ATM). The current design of a global
>> coalesced mmio ring is horrible /wrt latency.
>>
> Indeed. git pull, recheck and call for kvm_flush_coalesced_mmio_buffer()
> is gone. So this will break new userspace, not old. By global you mean
> shared between devices (or memory regions)?

Yes. We only have a single ring per VM, so we cannot flush multi-second
VGA access separately from other devices. In theory this is solvable by
introducing per-region rings that can be driven separately.

Jan

--
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux

2012-10-22 11:53:23

by Jan Kiszka

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On 2012-10-22 13:23, Gleb Natapov wrote:
> On Mon, Oct 22, 2012 at 07:09:38PM +0800, Xiao Guangrong wrote:
>> On 10/22/2012 05:16 PM, Gleb Natapov wrote:
>>> On Fri, Oct 19, 2012 at 03:37:32PM +0800, Xiao Guangrong wrote:
>>>> After commit b3356bf0dbb349 (KVM: emulator: optimize "rep ins" handling),
>>>> the pieces of io data can be collected and write them to the guest memory
>>>> or MMIO together.
>>>>
>>>> Unfortunately, kvm splits the mmio access into 8 bytes and store them to
>>>> vcpu->mmio_fragments. If the guest uses "rep ins" to move large data, it
>>>> will cause vcpu->mmio_fragments overflow
>>>>
>>>> The bug can be exposed by isapc (-M isapc):
>>>>
>>>> [23154.818733] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC
>>>> [ ......]
>>>> [23154.858083] Call Trace:
>>>> [23154.859874] [<ffffffffa04f0e17>] kvm_get_cr8+0x1d/0x28 [kvm]
>>>> [23154.861677] [<ffffffffa04fa6d4>] kvm_arch_vcpu_ioctl_run+0xcda/0xe45 [kvm]
>>>> [23154.863604] [<ffffffffa04f5a1a>] ? kvm_arch_vcpu_load+0x17b/0x180 [kvm]
>>>>
>>>>
>>>> Actually, we can use one mmio_fragment to store a large mmio access for the
>>>> mmio access is always continuous then split it when we pass the mmio-exit-info
>>>> to userspace. After that, we only need two entries to store mmio info for
>>>> the cross-mmio pages access
>>>>
>>> I wonder can we put the data into coalesced mmio buffer instead of
>>
>> If we put all mmio data into coalesced buffer, we should:
>> - ensure the userspace program uses KVM_REGISTER_COALESCED_MMIO to register
>> all mmio regions.
>>
> It appears to not be so.
> Userspace calls kvm_flush_coalesced_mmio_buffer() after returning from
> KVM_RUN which looks like this:

Nope, no longer; only on accesses to devices that actually use such
regions (and there are only two ATM). The current design of a global
coalesced mmio ring is horrible w.r.t. latency.

Jan

--
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux

2012-10-22 12:18:59

by Avi Kivity

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On 10/22/2012 01:45 PM, Jan Kiszka wrote:

>> Indeed. git pull, recheck and call for kvm_flush_coalesced_mmio_buffer()
>> is gone. So this will break new userspace, not old. By global you mean
>> shared between devices (or memory regions)?
>
> Yes. We only have a single ring per VM, so we cannot flush multi-second
> VGA access separately from other devices. In theory solvable by
> introducing per-region rings that can be driven separately.

But in practice unneeded. Real time VMs can disable coalescing and not
use planar VGA modes.


--
error compiling committee.c: too many arguments to function

2012-10-22 12:45:45

by Jan Kiszka

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On 2012-10-22 14:18, Avi Kivity wrote:
> On 10/22/2012 01:45 PM, Jan Kiszka wrote:
>
>>> Indeed. git pull, recheck and call for kvm_flush_coalesced_mmio_buffer()
>>> is gone. So this will break new userspace, not old. By global you mean
>>> shared between devices (or memory regions)?
>>
>> Yes. We only have a single ring per VM, so we cannot flush multi-second
>> VGA access separately from other devices. In theory solvable by
>> introducing per-region rings that can be driven separately.
>
> But in practice unneeded. Real time VMs can disable coalescing and not
> use planar VGA modes.

A) At least right now, we do not differentiate between the VGA modes and
whether flushing is needed. So that device is generally taboo for RT
cores of the VM.
B) We need to disable coalescing in E1000 as well - if we want to use
that model.
C) Gleb seems to propose using coalescing far beyond those two use cases.

Jan

--
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux

2012-10-22 12:53:08

by Gleb Natapov

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On Mon, Oct 22, 2012 at 02:45:37PM +0200, Jan Kiszka wrote:
> On 2012-10-22 14:18, Avi Kivity wrote:
> > On 10/22/2012 01:45 PM, Jan Kiszka wrote:
> >
> >>> Indeed. git pull, recheck and call for kvm_flush_coalesced_mmio_buffer()
> >>> is gone. So this will break new userspace, not old. By global you mean
> >>> shared between devices (or memory regions)?
> >>
> >> Yes. We only have a single ring per VM, so we cannot flush multi-second
> >> VGA access separately from other devices. In theory solvable by
> >> introducing per-region rings that can be driven separately.
> >
> > But in practice unneeded. Real time VMs can disable coalescing and not
> > use planar VGA modes.
>
> A) At least right now, we do not differentiate between the VGA modes and
> if flushing is needed. So that device is generally taboo for RT cores of
> the VM.
> B) We need to disable coalescing in E1000 as well - if we want to use
> that model.
> C) Gleb seems to propose using coalescing far beyond those two use cases.
>
Since a userspace change is needed the idea is dead, but if we could
implement it I do not see how it could hurt latency if it were the only
mechanism using the coalesced mmio buffer. Checking that the ring buffer
is empty is cheap, and if it is not empty it means the kernel just saved
you a lot of 8-byte exits, so even after iterating over all the entries
you have still saved a lot of time.
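
The cheap emptiness check amounts to a single index compare on the shared
ring. A minimal sketch, assuming the ring layout exported by linux/kvm.h:

#include <linux/kvm.h>

/* Sketch only: the ring is empty when the consumer index (first) has
 * caught up with the producer index (last), so an unconditional flush
 * costs one compare in the common case. */
static int coalesced_ring_empty(const struct kvm_coalesced_mmio_ring *ring)
{
	return ring->first == ring->last;
}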

--
Gleb.

2012-10-22 12:55:39

by Jan Kiszka

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On 2012-10-22 14:53, Gleb Natapov wrote:
> On Mon, Oct 22, 2012 at 02:45:37PM +0200, Jan Kiszka wrote:
>> On 2012-10-22 14:18, Avi Kivity wrote:
>>> On 10/22/2012 01:45 PM, Jan Kiszka wrote:
>>>
>>>>> Indeed. git pull, recheck and call for kvm_flush_coalesced_mmio_buffer()
>>>>> is gone. So this will break new userspace, not old. By global you mean
>>>>> shared between devices (or memory regions)?
>>>>
>>>> Yes. We only have a single ring per VM, so we cannot flush multi-second
>>>> VGA access separately from other devices. In theory solvable by
>>>> introducing per-region rings that can be driven separately.
>>>
>>> But in practice unneeded. Real time VMs can disable coalescing and not
>>> use planar VGA modes.
>>
>> A) At least right now, we do not differentiate between the VGA modes and
>> if flushing is needed. So that device is generally taboo for RT cores of
>> the VM.
>> B) We need to disable coalescing in E1000 as well - if we want to use
>> that model.
>> C) Gleb seems to propose using coalescing far beyond those two use cases.
>>
> Since the userspace change is needed the idea is dead, but if we could
> implement it I do not see how it can hurt the latency if it would be the
> only mechanism to use coalesced mmio buffer. Checking that the ring buffer
> is empty is cheap and if it is not empty it means that kernel just saved
> you a lot of 8 bytes exists so even after iterating over all the entries there
> you still saved a lot of time.

When taking an exit for A, I'm not interested in flushing stuff for B
unless I have a dependency. Thus, buffers would have to be per device
before extending their use.

Jan

--
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux

2012-10-22 12:55:58

by Avi Kivity

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On 10/22/2012 02:53 PM, Gleb Natapov wrote:
> On Mon, Oct 22, 2012 at 02:45:37PM +0200, Jan Kiszka wrote:
>> On 2012-10-22 14:18, Avi Kivity wrote:
>> > On 10/22/2012 01:45 PM, Jan Kiszka wrote:
>> >
>> >>> Indeed. git pull, recheck and call for kvm_flush_coalesced_mmio_buffer()
>> >>> is gone. So this will break new userspace, not old. By global you mean
>> >>> shared between devices (or memory regions)?
>> >>
>> >> Yes. We only have a single ring per VM, so we cannot flush multi-second
>> >> VGA access separately from other devices. In theory solvable by
>> >> introducing per-region rings that can be driven separately.
>> >
>> > But in practice unneeded. Real time VMs can disable coalescing and not
>> > use planar VGA modes.
>>
>> A) At least right now, we do not differentiate between the VGA modes and
>> if flushing is needed. So that device is generally taboo for RT cores of
>> the VM.
>> B) We need to disable coalescing in E1000 as well - if we want to use
>> that model.
>> C) Gleb seems to propose using coalescing far beyond those two use cases.
>>
> Since the userspace change is needed the idea is dead, but if we could
> implement it I do not see how it can hurt the latency if it would be the
> only mechanism to use coalesced mmio buffer. Checking that the ring buffer
> is empty is cheap and if it is not empty it means that kernel just saved
> you a lot of 8 bytes exists so even after iterating over all the entries there
> you still saved a lot of time.

It's time during which the guest cannot take interrupts, and time in a
high-priority guest thread that is spent processing low-priority guest
requests.


--
error compiling committee.c: too many arguments to function

2012-10-22 12:56:51

by Avi Kivity

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On 10/22/2012 02:45 PM, Jan Kiszka wrote:
> On 2012-10-22 14:18, Avi Kivity wrote:
>> On 10/22/2012 01:45 PM, Jan Kiszka wrote:
>>
>>>> Indeed. git pull, recheck and call for kvm_flush_coalesced_mmio_buffer()
>>>> is gone. So this will break new userspace, not old. By global you mean
>>>> shared between devices (or memory regions)?
>>>
>>> Yes. We only have a single ring per VM, so we cannot flush multi-second
>>> VGA access separately from other devices. In theory solvable by
>>> introducing per-region rings that can be driven separately.
>>
>> But in practice unneeded. Real time VMs can disable coalescing and not
>> use planar VGA modes.
>
> A) At least right now, we do not differentiate between the VGA modes and
> if flushing is needed. So that device is generally taboo for RT cores of
> the VM.

In non-planar modes the memory will be direct mapped, which overrides
coalescing (since kvm or qemu never see an exit).

> B) We need to disable coalescing in E1000 as well - if we want to use
> that model.

True.

--
error compiling committee.c: too many arguments to function

2012-10-22 12:58:33

by Avi Kivity

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On 10/22/2012 02:55 PM, Jan Kiszka wrote:
>> Since the userspace change is needed the idea is dead, but if we could
>> implement it I do not see how it can hurt the latency if it would be the
>> only mechanism to use coalesced mmio buffer. Checking that the ring buffer
>> is empty is cheap and if it is not empty it means that kernel just saved
>> you a lot of 8 bytes exists so even after iterating over all the entries there
>> you still saved a lot of time.
>
> When taking an exit for A, I'm not interesting in flushing stuff for B
> unless I have a dependency. Thus, buffers would have to be per device
> before extending their use.

Any mmio exit has to flush everything. For example a DMA caused by an
e1000 write has to see any writes to the framebuffer, in case the guest
is transmitting its framebuffer to the outside world.

--
error compiling committee.c: too many arguments to function

2012-10-22 12:59:00

by Gleb Natapov

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On Mon, Oct 22, 2012 at 02:55:14PM +0200, Jan Kiszka wrote:
> On 2012-10-22 14:53, Gleb Natapov wrote:
> > On Mon, Oct 22, 2012 at 02:45:37PM +0200, Jan Kiszka wrote:
> >> On 2012-10-22 14:18, Avi Kivity wrote:
> >>> On 10/22/2012 01:45 PM, Jan Kiszka wrote:
> >>>
> >>>>> Indeed. git pull, recheck and call for kvm_flush_coalesced_mmio_buffer()
> >>>>> is gone. So this will break new userspace, not old. By global you mean
> >>>>> shared between devices (or memory regions)?
> >>>>
> >>>> Yes. We only have a single ring per VM, so we cannot flush multi-second
> >>>> VGA access separately from other devices. In theory solvable by
> >>>> introducing per-region rings that can be driven separately.
> >>>
> >>> But in practice unneeded. Real time VMs can disable coalescing and not
> >>> use planar VGA modes.
> >>
> >> A) At least right now, we do not differentiate between the VGA modes and
> >> if flushing is needed. So that device is generally taboo for RT cores of
> >> the VM.
> >> B) We need to disable coalescing in E1000 as well - if we want to use
> >> that model.
> >> C) Gleb seems to propose using coalescing far beyond those two use cases.
> >>
> > Since the userspace change is needed the idea is dead, but if we could
> > implement it I do not see how it can hurt the latency if it would be the
> > only mechanism to use coalesced mmio buffer. Checking that the ring buffer
> > is empty is cheap and if it is not empty it means that kernel just saved
> > you a lot of 8 bytes exists so even after iterating over all the entries there
> > you still saved a lot of time.
>
> When taking an exit for A, I'm not interesting in flushing stuff for B
> unless I have a dependency. Thus, buffers would have to be per device
> before extending their use.
>
But this is not what will happen (in the absence of other users of
coalesced mmio). What will happen is that instead of taking 200 exits
for B you will take one exit for B.

--
Gleb.

2012-10-22 13:01:35

by Gleb Natapov

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On Mon, Oct 22, 2012 at 02:55:24PM +0200, Avi Kivity wrote:
> On 10/22/2012 02:53 PM, Gleb Natapov wrote:
> > On Mon, Oct 22, 2012 at 02:45:37PM +0200, Jan Kiszka wrote:
> >> On 2012-10-22 14:18, Avi Kivity wrote:
> >> > On 10/22/2012 01:45 PM, Jan Kiszka wrote:
> >> >
> >> >>> Indeed. git pull, recheck and call for kvm_flush_coalesced_mmio_buffer()
> >> >>> is gone. So this will break new userspace, not old. By global you mean
> >> >>> shared between devices (or memory regions)?
> >> >>
> >> >> Yes. We only have a single ring per VM, so we cannot flush multi-second
> >> >> VGA access separately from other devices. In theory solvable by
> >> >> introducing per-region rings that can be driven separately.
> >> >
> >> > But in practice unneeded. Real time VMs can disable coalescing and not
> >> > use planar VGA modes.
> >>
> >> A) At least right now, we do not differentiate between the VGA modes and
> >> if flushing is needed. So that device is generally taboo for RT cores of
> >> the VM.
> >> B) We need to disable coalescing in E1000 as well - if we want to use
> >> that model.
> >> C) Gleb seems to propose using coalescing far beyond those two use cases.
> >>
> > Since the userspace change is needed the idea is dead, but if we could
> > implement it I do not see how it can hurt the latency if it would be the
> > only mechanism to use coalesced mmio buffer. Checking that the ring buffer
> > is empty is cheap and if it is not empty it means that kernel just saved
> > you a lot of 8 bytes exists so even after iterating over all the entries there
> > you still saved a lot of time.
>
> It's time where the guest cannot take interrupts, and time in a high
> priority guest thread that is spent processing low guest priority requests.
>
The proposed fix has exactly the same issue. Until all the data is
transferred to userspace, no interrupt will be served.

--
Gleb.

2012-10-22 13:02:53

by Avi Kivity

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On 10/22/2012 03:01 PM, Gleb Natapov wrote:

>> It's time where the guest cannot take interrupts, and time in a high
>> priority guest thread that is spent processing low guest priority requests.
>>
> Proposed fix has exactly same issue. Until all data is transfered to
> userspace no interrupt will be served.

For mmio_fragments that is okay. It's the same guest instruction, and
it's still O(1).

It's not okay for general mmio coalescing.


--
error compiling committee.c: too many arguments to function

2012-10-22 13:06:10

by Gleb Natapov

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On Mon, Oct 22, 2012 at 03:02:22PM +0200, Avi Kivity wrote:
> On 10/22/2012 03:01 PM, Gleb Natapov wrote:
>
> >> It's time where the guest cannot take interrupts, and time in a high
> >> priority guest thread that is spent processing low guest priority requests.
> >>
> > Proposed fix has exactly same issue. Until all data is transfered to
> > userspace no interrupt will be served.
>
> For mmio_fragments that is okay. It's the same guest instruction, and
> it's still O(1).
>
> It's not okay for general mmio coalescing.
>
Ah, so optimizing the transmission of mmio_fragments to userspace using a
dedicated coalesced MMIO buffer should be fine then. Unfortunately, since
we cannot use the shared ring buffer that exists now, this is too much
work for a small gain that only new QEMU would be able to enjoy.

--
Gleb.

2012-10-22 13:06:15

by Jan Kiszka

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On 2012-10-22 14:58, Avi Kivity wrote:
> On 10/22/2012 02:55 PM, Jan Kiszka wrote:
>>> Since the userspace change is needed the idea is dead, but if we could
>>> implement it I do not see how it can hurt the latency if it would be the
>>> only mechanism to use coalesced mmio buffer. Checking that the ring buffer
>>> is empty is cheap and if it is not empty it means that kernel just saved
>>> you a lot of 8 bytes exists so even after iterating over all the entries there
>>> you still saved a lot of time.
>>
>> When taking an exit for A, I'm not interesting in flushing stuff for B
>> unless I have a dependency. Thus, buffers would have to be per device
>> before extending their use.
>
> Any mmio exit has to flush everything. For example a DMA caused by an
> e1000 write has to see any writes to the framebuffer, in case the guest
> is transmitting its framebuffer to the outside world.

We already flush when that crazy guest actually accesses the region, no
need to do this unconditionally.

Jan

--
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux

2012-10-22 13:08:18

by Gleb Natapov

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On Mon, Oct 22, 2012 at 03:05:58PM +0200, Jan Kiszka wrote:
> On 2012-10-22 14:58, Avi Kivity wrote:
> > On 10/22/2012 02:55 PM, Jan Kiszka wrote:
> >>> Since the userspace change is needed the idea is dead, but if we could
> >>> implement it I do not see how it can hurt the latency if it would be the
> >>> only mechanism to use coalesced mmio buffer. Checking that the ring buffer
> >>> is empty is cheap and if it is not empty it means that kernel just saved
> >>> you a lot of 8 bytes exists so even after iterating over all the entries there
> >>> you still saved a lot of time.
> >>
> >> When taking an exit for A, I'm not interesting in flushing stuff for B
> >> unless I have a dependency. Thus, buffers would have to be per device
> >> before extending their use.
> >
> > Any mmio exit has to flush everything. For example a DMA caused by an
> > e1000 write has to see any writes to the framebuffer, in case the guest
> > is transmitting its framebuffer to the outside world.
>
> We already flush when that crazy guest actually accesses the region, no
> need to do this unconditionally.
>
What if the framebuffer is accessed from inside the kernel? Is this case handled?

--
Gleb.

2012-10-22 13:26:14

by Jan Kiszka

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On 2012-10-22 15:08, Gleb Natapov wrote:
> On Mon, Oct 22, 2012 at 03:05:58PM +0200, Jan Kiszka wrote:
>> On 2012-10-22 14:58, Avi Kivity wrote:
>>> On 10/22/2012 02:55 PM, Jan Kiszka wrote:
>>>>> Since the userspace change is needed the idea is dead, but if we could
>>>>> implement it I do not see how it can hurt the latency if it would be the
>>>>> only mechanism to use coalesced mmio buffer. Checking that the ring buffer
>>>>> is empty is cheap and if it is not empty it means that kernel just saved
>>>>> you a lot of 8 bytes exists so even after iterating over all the entries there
>>>>> you still saved a lot of time.
>>>>
>>>> When taking an exit for A, I'm not interesting in flushing stuff for B
>>>> unless I have a dependency. Thus, buffers would have to be per device
>>>> before extending their use.
>>>
>>> Any mmio exit has to flush everything. For example a DMA caused by an
>>> e1000 write has to see any writes to the framebuffer, in case the guest
>>> is transmitting its framebuffer to the outside world.
>>
>> We already flush when that crazy guest actually accesses the region, no
>> need to do this unconditionally.
>>
> What if framebuffer is accessed from inside the kernel? Is this case handled?

Unless I'm missing a case, there is no direct access to the framebuffer
possible when we are also doing coalescing. Everything needs to go
through userspace.

Jan

--
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux

2012-10-22 13:58:26

by Avi Kivity

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On 10/19/2012 09:37 AM, Xiao Guangrong wrote:
> After commit b3356bf0dbb349 (KVM: emulator: optimize "rep ins" handling),
> the pieces of io data can be collected and write them to the guest memory
> or MMIO together.
>
> Unfortunately, kvm splits the mmio access into 8 bytes and store them to
> vcpu->mmio_fragments. If the guest uses "rep ins" to move large data, it
> will cause vcpu->mmio_fragments overflow
>
> The bug can be exposed by isapc (-M isapc):
>
> [23154.818733] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC
> [ ......]
> [23154.858083] Call Trace:
> [23154.859874] [<ffffffffa04f0e17>] kvm_get_cr8+0x1d/0x28 [kvm]
> [23154.861677] [<ffffffffa04fa6d4>] kvm_arch_vcpu_ioctl_run+0xcda/0xe45 [kvm]
> [23154.863604] [<ffffffffa04f5a1a>] ? kvm_arch_vcpu_load+0x17b/0x180 [kvm]
>
>
> Actually, we can use one mmio_fragment to store a large mmio access for the
> mmio access is always continuous then split it when we pass the mmio-exit-info
> to userspace.

Note, there are instructions that can access discontinuous areas. We don't emulate them and they're unlikely to be used for mmio.

> After that, we only need two entries to store mmio info for
> the cross-mmio pages access

The patch is good, but it is somewhat large for 3.7. Maybe we can make it
smaller with the following:

>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 8b90dd5..41ceb51 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3779,9 +3779,6 @@ static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
> static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
> void *val, int bytes)
> {
> - struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
> -
> - memcpy(vcpu->run->mmio.data, frag->data, frag->len);
> return X86EMUL_CONTINUE;
> }
>
> @@ -3799,6 +3796,64 @@ static const struct read_write_emulator_ops write_emultor = {
> .write = true,
> };
>
> +static bool get_current_mmio_info(struct kvm_vcpu *vcpu, gpa_t *gpa,
> + unsigned *len, void **data)
> +{
> + struct kvm_mmio_fragment *frag;
> + int cur = vcpu->mmio_cur_fragment;
> +
> + if (cur >= vcpu->mmio_nr_fragments)
> + return false;
> +
> + frag = &vcpu->mmio_fragments[cur];
> + if (frag->pos >= frag->len) {
> + if (++vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments)
> + return false;
> + frag++;
> + }

Instead of having ->pos, just adjust ->gpa, ->data, and ->len in place.
Then get_current_mmio_info would be unneeded; only the min() bit when
accessing ->len would remain.

> +
> + *gpa = frag->gpa + frag->pos;
> + *data = frag->data + frag->pos;
> + *len = min(8u, frag->len - frag->pos);
> + return true;
> +}
> +
> +static void complete_current_mmio(struct kvm_vcpu *vcpu)
> +{
> + struct kvm_mmio_fragment *frag;
> + gpa_t gpa;
> + unsigned len;
> + void *data;
> +
> + get_current_mmio_info(vcpu, &gpa, &len, &data);
> +
> + if (!vcpu->mmio_is_write)
> + memcpy(data, vcpu->run->mmio.data, len);
> +
> + /* Increase frag->pos to switch to the next mmio. */
> + frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
> + frag->pos += len;
> +}
> +


And this would be unneeded; just adjust the code that does mmio_cur_fragment++:

static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
- struct kvm_mmio_fragment *frag;
+ struct kvm_mmio_fragment frag;

BUG_ON(!vcpu->mmio_needed);

/* Complete previous fragment */
- frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
+ frag = vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
+ if (frag.len <= 8) {
+ ++vcpu->mmio_cur_fragment;
+ } else {
+ vcpu->mmio_fragments[vcpu->mmio_cur_fragment].len -= 8;
...
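
Spelled out, the in-place variant consumes the head of the current
fragment and advances its fields directly, so neither ->pos nor
get_current_mmio_info is needed. A minimal standalone sketch of just the
consume step (types pared down, helper name hypothetical, not the applied
patch):

#include <string.h>

struct frag { unsigned long gpa; unsigned char *data; unsigned len; };

/* Consume up to 8 bytes from the head of the fragment in place; once
 * len reaches 0 the caller advances mmio_cur_fragment. */
static unsigned consume_piece(struct frag *f, const void *run_data,
			      int is_write)
{
	unsigned len = f->len < 8 ? f->len : 8;

	if (!is_write)		/* a read: copy what userspace returned */
		memcpy(f->data, run_data, len);

	f->gpa  += len;
	f->data += len;
	f->len  -= len;
	return len;
}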





--
error compiling committee.c: too many arguments to function

2012-10-22 14:01:34

by Gleb Natapov

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On Mon, Oct 22, 2012 at 03:25:49PM +0200, Jan Kiszka wrote:
> On 2012-10-22 15:08, Gleb Natapov wrote:
> > On Mon, Oct 22, 2012 at 03:05:58PM +0200, Jan Kiszka wrote:
> >> On 2012-10-22 14:58, Avi Kivity wrote:
> >>> On 10/22/2012 02:55 PM, Jan Kiszka wrote:
> >>>>> Since the userspace change is needed the idea is dead, but if we could
> >>>>> implement it I do not see how it can hurt the latency if it would be the
> >>>>> only mechanism to use coalesced mmio buffer. Checking that the ring buffer
> >>>>> is empty is cheap and if it is not empty it means that kernel just saved
> >>>>> you a lot of 8 bytes exists so even after iterating over all the entries there
> >>>>> you still saved a lot of time.
> >>>>
> >>>> When taking an exit for A, I'm not interesting in flushing stuff for B
> >>>> unless I have a dependency. Thus, buffers would have to be per device
> >>>> before extending their use.
> >>>
> >>> Any mmio exit has to flush everything. For example a DMA caused by an
> >>> e1000 write has to see any writes to the framebuffer, in case the guest
> >>> is transmitting its framebuffer to the outside world.
> >>
> >> We already flush when that crazy guest actually accesses the region, no
> >> need to do this unconditionally.
> >>
> > What if framebuffer is accessed from inside the kernel? Is this case handled?
>
> Unless I miss a case now, there is no direct access to the framebuffer
> possible when we are also doing coalescing. Everything needs to go
> through userspace.
>
Yes, with the frame buffer that seems to be the case. One can imagine a
ROMD device that is MMIO on write but can still be read from the kernel;
it cannot be coalesced even if the coalesced buffer is flushed on every
exit.

--
Gleb.

2012-10-22 14:23:32

by Jan Kiszka

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On 2012-10-22 16:00, Gleb Natapov wrote:
> On Mon, Oct 22, 2012 at 03:25:49PM +0200, Jan Kiszka wrote:
>> On 2012-10-22 15:08, Gleb Natapov wrote:
>>> On Mon, Oct 22, 2012 at 03:05:58PM +0200, Jan Kiszka wrote:
>>>> On 2012-10-22 14:58, Avi Kivity wrote:
>>>>> On 10/22/2012 02:55 PM, Jan Kiszka wrote:
>>>>>>> Since the userspace change is needed the idea is dead, but if we could
>>>>>>> implement it I do not see how it can hurt the latency if it would be the
>>>>>>> only mechanism to use coalesced mmio buffer. Checking that the ring buffer
>>>>>>> is empty is cheap and if it is not empty it means that kernel just saved
>>>>>>> you a lot of 8 bytes exists so even after iterating over all the entries there
>>>>>>> you still saved a lot of time.
>>>>>>
>>>>>> When taking an exit for A, I'm not interesting in flushing stuff for B
>>>>>> unless I have a dependency. Thus, buffers would have to be per device
>>>>>> before extending their use.
>>>>>
>>>>> Any mmio exit has to flush everything. For example a DMA caused by an
>>>>> e1000 write has to see any writes to the framebuffer, in case the guest
>>>>> is transmitting its framebuffer to the outside world.
>>>>
>>>> We already flush when that crazy guest actually accesses the region, no
>>>> need to do this unconditionally.
>>>>
>>> What if framebuffer is accessed from inside the kernel? Is this case handled?
>>
>> Unless I miss a case now, there is no direct access to the framebuffer
>> possible when we are also doing coalescing. Everything needs to go
>> through userspace.
>>
> Yes, with frame buffer is seems to be the case. One can imagine ROMD
> device that is MMIO on write but still can be accessed for read from
> kernel, but it cannot be coalesced even if coalesced buffer is flushed
> on every exit.

Usually, a ROMD device has stable content as long as it is fast-read/
slow-write. Once it switches mode, it is slow-read as well.

Jan

--
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux

2012-10-22 15:36:49

by Avi Kivity

Subject: Re: [PATCH] KVM: x86: fix vcpu->mmio_fragments overflow

On 10/22/2012 04:00 PM, Gleb Natapov wrote:
> Yes, with frame buffer is seems to be the case. One can imagine ROMD
> device that is MMIO on write but still can be accessed for read from
> kernel, but it cannot be coalesced even if coalesced buffer is flushed
> on every exit.

You cannot enable coalescing on such a device.


--
error compiling committee.c: too many arguments to function

2012-11-01 00:13:35

by Marcelo Tosatti

Subject: Re: [PATCH] emulator test: add "rep ins" mmio access test

On Fri, Oct 19, 2012 at 03:39:08PM +0800, Xiao Guangrong wrote:
> Add the test to trigger the bug that "rep ins" causes vcpu->mmio_fragments
> overflow overflow while move large data from ioport to MMIO
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> x86/emulator.c | 14 ++++++++++++++
> 1 files changed, 14 insertions(+), 0 deletions(-)

Applied, thanks.