2016-03-25 13:29:01

by Xiao Guangrong

Subject: [PATCH 1/4] KVM: MMU: fix permission_fault()

kvm-unit-tests complained that the PFEC is not set properly, e.g.:
test pte.rw pte.d pte.nx pde.p pde.rw pde.pse user fetch: FAIL: error code 15
expected 5
Dump mapping: address: 0x123400000000
------L4: 3e95007
------L3: 3e96007
------L2: 2000083

It is caused by the fact that the PFEC returned to the guest is copied from
the PFEC triggered by the shadow page table.

This patch fixes it and makes the logic of updating errcode cleaner.
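
For readers new to the idiom, the return statement in the hunk below relies
on branchless masking; a minimal standalone sketch (names are illustrative,
not KVM's):

#include <stdbool.h>
#include <stdint.h>

/*
 * -(uint32_t)fault evaluates to 0xffffffff when fault is true and to 0
 * when it is false, so the AND either passes the error code through
 * unchanged or collapses it to zero -- no branch required.
 */
static uint32_t masked_errcode(bool fault, uint32_t errcode)
{
	return -(uint32_t)fault & errcode;
}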

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.h | 8 ++++----
arch/x86/kvm/paging_tmpl.h | 2 +-
2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index b70df72..81bffd1 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -154,7 +154,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
unsigned pfec)
{
int cpl = kvm_x86_ops->get_cpl(vcpu);
- unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
+ unsigned long errcode, rflags = kvm_x86_ops->get_rflags(vcpu);

/*
* If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1.
@@ -175,7 +175,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
bool fault = (mmu->permissions[index] >> pte_access) & 1;

WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK));
- pfec |= PFERR_PRESENT_MASK;
+ errcode = PFERR_PRESENT_MASK;

if (unlikely(mmu->pkru_mask)) {
u32 pkru_bits, offset;
@@ -193,11 +193,11 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));

pkru_bits &= mmu->pkru_mask >> offset;
- pfec |= -pkru_bits & PFERR_PK_MASK;
+ errcode |= -pkru_bits & PFERR_PK_MASK;
fault |= (pkru_bits != 0);
}

- return -(uint32_t)fault & pfec;
+ return -(uint32_t)fault & errcode;
}

void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 1d971c7..bc019f7 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -360,7 +360,7 @@ retry_walk:
goto error;

if (unlikely(is_rsvd_bits_set(mmu, pte, walker->level))) {
- errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
+ errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
goto error;
}

--
1.8.3.1


2016-03-25 13:29:06

by Xiao Guangrong

Subject: [PATCH 4/4] KVM: MMU: skip obsolete sp in for_each_gfn_*()

An obsolete sp should not be used by current vCPUs and cannot hurt a running
vCPU, so skip it in for_each_gfn_sp() and for_each_gfn_indirect_valid_sp().

The side effect is that we will double-check role.invalid in
kvm_mmu_get_page(), but I think that is okay since role is well cached.
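
The "{} else" shape these macros use is a common kernel trick for filtering
inside an iteration macro; a minimal standalone sketch (the macro and names
are illustrative only):

#include <stdio.h>

/*
 * The caller's loop body binds to the trailing "else", so filtered
 * items hit the empty "{}" block and only the survivors run the body --
 * no extra braces at the call site, no dangling-else surprises.
 */
#define for_each_even(_i, _n)				\
	for ((_i) = 0; (_i) < (_n); (_i)++)		\
		if ((_i) & 1) {} else

int main(void)
{
	int i;

	for_each_even(i, 10)
		printf("%d\n", i);	/* prints 0 2 4 6 8 */
	return 0;
}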

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 28 ++++++++++++----------------
1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c396e8b..4d66a9e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1906,18 +1906,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
* since it has been deleted from active_mmu_pages but still can be found
* at hast list.
*
- * for_each_gfn_indirect_valid_sp has skipped that kind of page and
- * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), has skipped
- * all the obsolete pages.
+ * for_each_gfn_valid_sp() has skipped that kind of pages.
*/
-#define for_each_gfn_sp(_kvm, _sp, _gfn) \
+#define for_each_gfn_valid_sp(_kvm, _sp, _gfn) \
hlist_for_each_entry(_sp, \
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
- if ((_sp)->gfn != (_gfn)) {} else
+ if ((_sp)->gfn != (_gfn) || is_obsolete_sp((_kvm), (_sp)) \
+ || (_sp)->role.invalid) {} else

#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
- for_each_gfn_sp(_kvm, _sp, _gfn) \
- if ((_sp)->role.direct || (_sp)->role.invalid) {} else
+ for_each_gfn_valid_sp(_kvm, _sp, _gfn) \
+ if ((_sp)->role.direct) {} else

/* @sp->gfn should be write-protected at the call site */
static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -1958,6 +1957,11 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
static void mmu_audit_disable(void) { }
#endif

+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
@@ -2092,11 +2096,6 @@ static void clear_sp_write_flooding_count(u64 *spte)
__clear_sp_write_flooding_count(sp);
}

-static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
-}
-
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
gva_t gaddr,
@@ -2123,10 +2122,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
}
- for_each_gfn_sp(vcpu->kvm, sp, gfn) {
- if (is_obsolete_sp(vcpu->kvm, sp))
- continue;
-
+ for_each_gfn_valid_sp(vcpu->kvm, sp, gfn) {
if (!need_sync && sp->unsync)
need_sync = true;

--
1.8.3.1

2016-03-25 13:29:04

by Xiao Guangrong

Subject: [PATCH 3/4] KVM: MMU: reduce the size of mmu_page_path

Currently only PT64_ROOT_LEVEL - 1 levels are used; the additional entry
in .parent[] serves as a sentinel and the additional entry in .idx[] is
purely wasted.

This patch shrinks both arrays and sets the sentinel one level above the
level where the walk starts.
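
A worked example, assuming PT64_ROOT_LEVEL == 4 (its value with 4-level
paging at the time):

/*
 * mmu_pages_next() stores a page of role.level L at parent[L - 2]
 * (leaves, L == 1, store nothing), so:
 *
 *	level 4 page -> parent[2]
 *	level 3 page -> parent[1]
 *	level 2 page -> parent[0]
 *
 * parent[3] was only ever written as the NULL sentinel and idx[3] was
 * never written at all, which is why both arrays can shrink to
 * PT64_ROOT_LEVEL - 1 entries: the sentinel is now set up front, and
 * only when the walk starts below the root level (a root-level walk is
 * instead terminated by the new bound in mmu_pages_clear_parents()).
 */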

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 32 ++++++++++++--------------------
1 file changed, 12 insertions(+), 20 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e273144..c396e8b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1984,12 +1984,12 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
}

struct mmu_page_path {
- struct kvm_mmu_page *parent[PT64_ROOT_LEVEL];
- unsigned int idx[PT64_ROOT_LEVEL];
+ struct kvm_mmu_page *parent[PT64_ROOT_LEVEL - 1];
+ unsigned int idx[PT64_ROOT_LEVEL - 1];
};

#define for_each_sp(pvec, sp, parents, i) \
- for (i = mmu_pages_first(&pvec, &parents); \
+ for (i = mmu_pages_next(&pvec, &parents, -1); \
i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
i = mmu_pages_next(&pvec, &parents, i))

@@ -2016,25 +2016,15 @@ static int mmu_pages_next(struct kvm_mmu_pages *pvec,
return n;
}

-static int mmu_pages_first(struct kvm_mmu_pages *pvec,
- struct mmu_page_path *parents)
+static void
+mmu_pages_init(struct mmu_page_path *parents, struct kvm_mmu_page *parent)
{
- struct kvm_mmu_page *sp;
- int level;
-
- if (pvec->nr == 0)
- return 0;
-
- sp = pvec->page[0].sp;
- level = sp->role.level;
- WARN_ON(level == PT_PAGE_TABLE_LEVEL);
-
/*
- * Also set up a sentinel. Further entries in pvec are all
- * children of sp, so this element is never overwritten.
+ * set up a sentinel. Further entries in pvec are all children of
+ * sp, so this element is never overwritten.
*/
- parents->parent[level - 1] = NULL;
- return mmu_pages_next(pvec, parents, -1);
+ if (parent->role.level < PT64_ROOT_LEVEL)
+ parents->parent[parent->role.level - 1] = NULL;
}

static void mmu_pages_clear_parents(struct mmu_page_path *parents)
@@ -2051,7 +2041,7 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)
WARN_ON(idx == INVALID_INDEX);
clear_unsync_child_bit(sp, idx);
level++;
- } while (!sp->unsync_children);
+ } while (!sp->unsync_children && (level < PT64_ROOT_LEVEL - 1));
}

static void mmu_sync_children(struct kvm_vcpu *vcpu,
@@ -2064,6 +2054,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
LIST_HEAD(invalid_list);
bool flush = false;

+ mmu_pages_init(&parents, parent);
while (mmu_unsync_walk(parent, &pages)) {
bool protected = false;

@@ -2335,6 +2326,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
if (parent->role.level == PT_PAGE_TABLE_LEVEL)
return 0;

+ mmu_pages_init(&parents, parent);
while (mmu_unsync_walk(parent, &pages)) {
struct kvm_mmu_page *sp;

--
1.8.3.1

2016-03-25 13:30:01

by Xiao Guangrong

Subject: [PATCH 2/4] KVM: MMU: simplify the logic of __mmu_unsync_walk()

Each time I look into the logic of walking unsync shadow pages, it takes a
lot of time to understand what it is doing. The trick is that the item
(sp, idx) saved to kvm_mmu_pages holds the sp and the index in the _parent_
level, and there is no comment explaining this fact.

This patch simplifies it by saving the sp together with the index inside
that same sp to kvm_mmu_pages, which makes the operations on the index much
easier to follow.
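
Restating the invariant as a sketch (the entry layout follows kvm_mmu_pages
in mmu.c; the wording is an assumption drawn from the hunks below):

/*
 * After this patch, an entry pairs a page with an index into that same
 * page:
 *
 *   - a page with unsync children is pushed as (sp, i), where i is the
 *     slot of the scanned child inside sp itself;
 *   - an unsync leaf is pushed as (child, INVALID_INDEX), since a
 *     last-level page has no child slot to record.
 *
 * Before the patch an entry was (child, index-in-parent): the index
 * referred to a different page than the one stored, which is the
 * undocumented trick described above.
 */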

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 40 ++++++++++++++++++++++------------------
1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6bdfbc2..e273144 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1830,6 +1830,8 @@ static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
__clear_bit(idx, sp->unsync_child_bitmap);
}

+#define INVALID_INDEX (-1)
+
static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
struct kvm_mmu_pages *pvec)
{
@@ -1846,10 +1848,10 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,

child = page_header(ent & PT64_BASE_ADDR_MASK);

- if (child->unsync_children) {
- if (mmu_pages_add(pvec, child, i))
- return -ENOSPC;
+ if (mmu_pages_add(pvec, sp, i))
+ return -ENOSPC;

+ if (child->unsync_children) {
ret = __mmu_unsync_walk(child, pvec);
if (!ret) {
clear_unsync_child_bit(sp, i);
@@ -1860,7 +1862,13 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
return ret;
} else if (child->unsync) {
nr_unsync_leaf++;
- if (mmu_pages_add(pvec, child, i))
+
+ /*
+ * the unsync is on the last level so its 'idx' is
+ * useless, we set it to INVALID_INDEX to catch
+ * potential bugs.
+ */
+ if (mmu_pages_add(pvec, child, INVALID_INDEX))
return -ENOSPC;
} else
clear_unsync_child_bit(sp, i);
@@ -1869,8 +1877,6 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
return nr_unsync_leaf;
}

-#define INVALID_INDEX (-1)
-
static int mmu_unsync_walk(struct kvm_mmu_page *sp,
struct kvm_mmu_pages *pvec)
{
@@ -1878,7 +1884,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
if (!sp->unsync_children)
return 0;

- mmu_pages_add(pvec, sp, INVALID_INDEX);
return __mmu_unsync_walk(sp, pvec);
}

@@ -1994,16 +1999,18 @@ static int mmu_pages_next(struct kvm_mmu_pages *pvec,
{
int n;

- for (n = i+1; n < pvec->nr; n++) {
+ for (n = i + 1; n < pvec->nr; n++) {
struct kvm_mmu_page *sp = pvec->page[n].sp;
unsigned idx = pvec->page[n].idx;
int level = sp->role.level;

- parents->idx[level-1] = idx;
- if (level == PT_PAGE_TABLE_LEVEL)
+ if (level == PT_PAGE_TABLE_LEVEL) {
+ WARN_ON(idx != INVALID_INDEX);
break;
+ }

- parents->parent[level-2] = sp;
+ parents->idx[level - 2] = idx;
+ parents->parent[level - 2] = sp;
}

return n;
@@ -2018,19 +2025,16 @@ static int mmu_pages_first(struct kvm_mmu_pages *pvec,
if (pvec->nr == 0)
return 0;

- WARN_ON(pvec->page[0].idx != INVALID_INDEX);
-
sp = pvec->page[0].sp;
level = sp->role.level;
WARN_ON(level == PT_PAGE_TABLE_LEVEL);

- parents->parent[level-2] = sp;
-
- /* Also set up a sentinel. Further entries in pvec are all
+ /*
+ * Also set up a sentinel. Further entries in pvec are all
* children of sp, so this element is never overwritten.
*/
- parents->parent[level-1] = NULL;
- return mmu_pages_next(pvec, parents, 0);
+ parents->parent[level - 1] = NULL;
+ return mmu_pages_next(pvec, parents, -1);
}

static void mmu_pages_clear_parents(struct mmu_page_path *parents)
--
1.8.3.1

2016-03-25 13:35:11

by Paolo Bonzini

Subject: Re: [PATCH 1/4] KVM: MMU: fix permission_fault()



On 25/03/2016 14:19, Xiao Guangrong wrote:
> kvm-unit-tests complained that the PFEC is not set properly, e.g.:
> test pte.rw pte.d pte.nx pde.p pde.rw pde.pse user fetch: FAIL: error code 15
> expected 5
> Dump mapping: address: 0x123400000000
> ------L4: 3e95007
> ------L3: 3e96007
> ------L2: 2000083

What's the command line for the reproducer?

> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index b70df72..81bffd1 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -154,7 +154,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
> unsigned pfec)
> {
> int cpl = kvm_x86_ops->get_cpl(vcpu);
> - unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
> + unsigned long errcode, rflags = kvm_x86_ops->get_rflags(vcpu);
>
> /*
> * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1.
> @@ -175,7 +175,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
> bool fault = (mmu->permissions[index] >> pte_access) & 1;
>
> WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK));
> - pfec |= PFERR_PRESENT_MASK;
> + errcode = PFERR_PRESENT_MASK;

So is this patch doing the same as "KVM: MMU: precompute page fault
error code"? It was necessary after all. :)

Paolo

>
> if (unlikely(mmu->pkru_mask)) {
> u32 pkru_bits, offset;
> @@ -193,11 +193,11 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
> ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));
>
> pkru_bits &= mmu->pkru_mask >> offset;
> - pfec |= -pkru_bits & PFERR_PK_MASK;
> + errcode |= -pkru_bits & PFERR_PK_MASK;
> fault |= (pkru_bits != 0);
> }
>
> - return -(uint32_t)fault & pfec;
> + return -(uint32_t)fault & errcode;
> }
>
> void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 1d971c7..bc019f7 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -360,7 +360,7 @@ retry_walk:
> goto error;
>
> if (unlikely(is_rsvd_bits_set(mmu, pte, walker->level))) {
> - errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
> + errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
> goto error;
> }
>

2016-03-25 13:43:09

by Xiao Guangrong

Subject: Re: [PATCH 1/4] KVM: MMU: fix permission_fault()



On 03/25/2016 09:35 PM, Paolo Bonzini wrote:
>
>
> On 25/03/2016 14:19, Xiao Guangrong wrote:
>> kvm-unit-tests complained that the PFEC is not set properly, e.g.:
>> test pte.rw pte.d pte.nx pde.p pde.rw pde.pse user fetch: FAIL: error code 15
>> expected 5
>> Dump mapping: address: 0x123400000000
>> ------L4: 3e95007
>> ------L3: 3e96007
>> ------L2: 2000083
>
> What's the command line for the reproducer?

QEMU=/home/eric/qemu/x86_64-softmmu/qemu-system-x86_64 ./x86-run x86/access.flat

>
>> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
>> index b70df72..81bffd1 100644
>> --- a/arch/x86/kvm/mmu.h
>> +++ b/arch/x86/kvm/mmu.h
>> @@ -154,7 +154,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>> unsigned pfec)
>> {
>> int cpl = kvm_x86_ops->get_cpl(vcpu);
>> - unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
>> + unsigned long errcode, rflags = kvm_x86_ops->get_rflags(vcpu);
>>
>> /*
>> * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1.
>> @@ -175,7 +175,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>> bool fault = (mmu->permissions[index] >> pte_access) & 1;
>>
>> WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK));
>> - pfec |= PFERR_PRESENT_MASK;
>> + errcode = PFERR_PRESENT_MASK;
>
> So is this patch doing the same as "KVM: MMU: precompute page fault
> error code"? It was necessary after all. :)

Sorry for my mistake... I missed the logic you changed :(

I still prefer calculating the error code on the fault path, which is rare,
or finding a way to encapsulate it in permission_fault()...

2016-03-25 13:45:46

by Paolo Bonzini

Subject: Re: [PATCH 3/4] KVM: MMU: reduce the size of mmu_page_path



On 25/03/2016 14:19, Xiao Guangrong wrote:
> Currently only PT64_ROOT_LEVEL - 1 levels are used; the additional entry
> in .parent[] serves as a sentinel and the additional entry in .idx[] is
> purely wasted.
>
> This patch shrinks both arrays and sets the sentinel one level above the
> level where the walk starts.

This patch and the previous one are basically redoing commit
0a47cd85833e ("KVM: MMU: Fix ubsan warnings", 2016-03-04). While you
find your version easier to understand, I of course find mine easier.

Rather than getting stuck in a ko fight, the solution is to stick with
the code in KVM and add comments. I'll give it a try...

Paolo

2016-03-25 13:49:47

by Xiao Guangrong

Subject: Re: [PATCH 3/4] KVM: MMU: reduce the size of mmu_page_path



On 03/25/2016 09:45 PM, Paolo Bonzini wrote:
>
>
> On 25/03/2016 14:19, Xiao Guangrong wrote:
>> Currently only PT64_ROOT_LEVEL - 1 levels are used; the additional entry
>> in .parent[] serves as a sentinel and the additional entry in .idx[] is
>> purely wasted.
>>
>> This patch shrinks both arrays and sets the sentinel one level above the
>> level where the walk starts.
>
> This patch and the previous one are basically redoing commit
> 0a47cd85833e ("KVM: MMU: Fix ubsan warnings", 2016-03-04). While you
> find your version easier to understand, I of course find mine easier.
>
> Rather than getting stuck in a ko fight, the solution is to stick with
> the code in KVM and add comments. I'll give it a try...

If you do not like this one, we can just make .idx[] hold
[PT64_ROOT_LEVEL - 1] entries and keep the sentinel in .parent[]; that is a
small change with a nice code shape.

2016-03-25 13:50:08

by Paolo Bonzini

Subject: Re: [PATCH 1/4] KVM: MMU: fix permission_fault()



On 25/03/2016 14:41, Xiao Guangrong wrote:
>>>
>>
>> So is this patch doing the same as "KVM: MMU: precompute page fault
>> error code"? It was necessary after all. :)
>
> Sorry for my mistake... I missed the logic you changed :(
>
> I still prefer calculating the error code on the fault path, which is
> rare, or finding a way to encapsulate it in permission_fault()...

Yes, I will apply your patch.

Paolo

2016-03-25 13:56:46

by Paolo Bonzini

Subject: Re: [PATCH 3/4] KVM: MMU: reduce the size of mmu_page_path



On 25/03/2016 14:48, Xiao Guangrong wrote:
>>>
>>
>> This patch and the previous one are basically redoing commit
>> 0a47cd85833e ("KVM: MMU: Fix ubsan warnings", 2016-03-04). While you
>> find your version easier to understand, I of course find mine easier.
>>
>> Rather than getting stuck in a ko fight, the solution is to stick with
>> the code in KVM and add comments. I'll give it a try...
>
>> If you do not like this one, we can just make .idx[] hold
>> [PT64_ROOT_LEVEL - 1] entries and keep the sentinel in .parent[]; that
>> is a small change with a nice code shape.

I suppose you'd have something like this then:

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 70e95d097ef1..15e1735a2e3a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1980,7 +1980,7 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,

struct mmu_page_path {
struct kvm_mmu_page *parent[PT64_ROOT_LEVEL];
- unsigned int idx[PT64_ROOT_LEVEL];
+ unsigned int idx[PT64_ROOT_LEVEL-1];
};

#define for_each_sp(pvec, sp, parents, i) \
@@ -2037,13 +2037,14 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)
{
struct kvm_mmu_page *sp;
unsigned int level = 0;
+ unsigned int idx;

do {
- unsigned int idx = parents->idx[level];
sp = parents->parent[level];
- if (!sp)
+ if (!sp || WARN_ON(level == PT64_ROOT_LEVEL-1))
return;

+ idx = parents->idx[level];
WARN_ON(idx == INVALID_INDEX);
clear_unsync_child_bit(sp, idx);
level++;

By making the arrays the same size, the effect of the sentinel seems
clearer to me. It doesn't seem worth 4 bytes (and strictly speaking
those 4 bytes would be there anyway due to padding)...
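
A quick back-of-the-envelope check of the padding point, assuming 64-bit
pointers and PT64_ROOT_LEVEL == 4:

	parent[4] + idx[4]: 4*8 + 4*4 = 48 bytes
	parent[4] + idx[3]: 4*8 + 3*4 = 44 bytes, rounded back up to 48
			    by the struct's 8-byte alignment

so dropping one idx[] entry indeed saves nothing on x86-64.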

Paolo

2016-03-25 14:08:10

by Xiao Guangrong

Subject: Re: [PATCH 3/4] KVM: MMU: reduce the size of mmu_page_path



On 03/25/2016 09:56 PM, Paolo Bonzini wrote:
>
>
> On 25/03/2016 14:48, Xiao Guangrong wrote:
>>>>
>>>
>>> This patch and the previous one are basically redoing commit
>>> 0a47cd85833e ("KVM: MMU: Fix ubsan warnings", 2016-03-04). While you
>>> find your version easier to understand, I of course find mine easier.
>>>
>>> Rather than getting stuck in a ko fight, the solution is to stick with
>>> the code in KVM and add comments. I'll give it a try...
>>
>> If you do not like this one, we can just make .idx[] hold
>> [PT64_ROOT_LEVEL - 1] entries and keep the sentinel in .parent[]; that
>> is a small change with a nice code shape.
>
> I suppose you'd have something like this then:
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 70e95d097ef1..15e1735a2e3a 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1980,7 +1980,7 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
>
> struct mmu_page_path {
> struct kvm_mmu_page *parent[PT64_ROOT_LEVEL];
> - unsigned int idx[PT64_ROOT_LEVEL];
> + unsigned int idx[PT64_ROOT_LEVEL-1];
> };
>
> #define for_each_sp(pvec, sp, parents, i) \
> @@ -2037,13 +2037,14 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)
> {
> struct kvm_mmu_page *sp;
> unsigned int level = 0;
> + unsigned int idx;
>
> do {
> - unsigned int idx = parents->idx[level];
> sp = parents->parent[level];
> - if (!sp)
> + if (!sp || WARN_ON(level == PT64_ROOT_LEVEL-1))
> return;
>
> + idx = parents->idx[level];
> WARN_ON(idx == INVALID_INDEX);
> clear_unsync_child_bit(sp, idx);
> level++;
>

Yes, exactly.

[ actually, we can keep mmu_pages_clear_parents() unchanged ]

> By making the arrays the same size, the effect of the sentinel seems
> clearer to me. It doesn't seem worth 4 bytes (and strictly speaking
> those 4 bytes would be there anyway due to padding)...

The sentinel is always NULL, so it can never enter the inner loop anyway...

Okay, I have no strong opinion on it; it is not a big deal. Let's happily
drop it if you really dislike it. :)

2016-03-25 14:22:03

by Paolo Bonzini

Subject: Re: [PATCH 1/4] KVM: MMU: fix permission_fault()



On 25/03/2016 14:19, Xiao Guangrong wrote:
> WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK));
> - pfec |= PFERR_PRESENT_MASK;
> + errcode = PFERR_PRESENT_MASK;
>
> if (unlikely(mmu->pkru_mask)) {
> u32 pkru_bits, offset;
> @@ -193,11 +193,11 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
> ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));
>
> pkru_bits &= mmu->pkru_mask >> offset;
> - pfec |= -pkru_bits & PFERR_PK_MASK;
> + errcode |= -pkru_bits & PFERR_PK_MASK;
> fault |= (pkru_bits != 0);
> }
>
> - return -(uint32_t)fault & pfec;
> + return -(uint32_t)fault & errcode;
> }

I have another doubt here.

If you get a fault due to U=0, you would not get PFERR_PK_MASK. This
is checked implicitly through the pte_user bit which we moved to
PFERR_RSVD_BIT. However, if you get a fault due to W=0 _and_
PKRU.AD=1 or PKRU.WD=1 for the page's protection key, would the PK
bit be set in the error code? If not, we would need something like
this:

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 81bffd1524c4..6835a551a5c4 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -172,12 +172,11 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC);
int index = (pfec >> 1) +
(smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1));
- bool fault = (mmu->permissions[index] >> pte_access) & 1;

WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK));
- errcode = PFERR_PRESENT_MASK;
+ errcode = (mmu->permissions[index] >> pte_access) & PFERR_PRESENT_MASK;

- if (unlikely(mmu->pkru_mask)) {
+ if (unlikely(-errcode & mmu->pkru_mask)) {
u32 pkru_bits, offset;

/*
@@ -188,11 +187,10 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));

pkru_bits &= mmu->pkru_mask >> offset;
- errcode |= -pkru_bits & PFERR_PK_MASK;
- fault |= (pkru_bits != 0);
+ errcode |= pkru_bits ? PFERR_PK_MASK | PFERR_PRESENT_MASK : 0;
}

- return -(uint32_t)fault & errcode;
+ return errcode;
}

void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);


Thanks,

Paolo

2016-03-25 14:22:48

by Paolo Bonzini

Subject: Re: [PATCH 3/4] KVM: MMU: reduce the size of mmu_page_path



On 25/03/2016 15:07, Xiao Guangrong wrote:
>>
>> @@ -2037,13 +2037,14 @@ static void mmu_pages_clear_parents(struct
>> mmu_page_path *parents)
>> {
>> struct kvm_mmu_page *sp;
>> unsigned int level = 0;
>> + unsigned int idx;
>>
>> do {
>> - unsigned int idx = parents->idx[level];
>> sp = parents->parent[level];
>> - if (!sp)
>> + if (!sp || WARN_ON(level == PT64_ROOT_LEVEL-1))
>> return;
>>
>> + idx = parents->idx[level];
>> WARN_ON(idx == INVALID_INDEX);
>> clear_unsync_child_bit(sp, idx);
>> level++;
>>
>
> Yes, exactly.
>
> [ actually, we can keep mmu_pages_clear_parents() unchanged ]

You cannot because ubsan would complain. :)

Paolo

2016-03-29 09:44:46

by Paolo Bonzini

Subject: Re: [PATCH 4/4] KVM: MMU: skip obsolete sp in for_each_gfn_*()

On 25/03/2016 14:19, Xiao Guangrong wrote:
> An obsolete sp should not be used by current vCPUs and cannot hurt a
> running vCPU, so skip it in for_each_gfn_sp() and
> for_each_gfn_indirect_valid_sp().
>
> The side effect is that we will double-check role.invalid in
> kvm_mmu_get_page(), but I think that is okay since role is well cached.
>
> Signed-off-by: Xiao Guangrong <[email protected]>

Queued for 4.7.

Paolo

> ---
> arch/x86/kvm/mmu.c | 28 ++++++++++++----------------
> 1 file changed, 12 insertions(+), 16 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index c396e8b..4d66a9e 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1906,18 +1906,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
> * since it has been deleted from active_mmu_pages but still can be found
> * at hast list.
> *
> - * for_each_gfn_indirect_valid_sp has skipped that kind of page and
> - * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), has skipped
> - * all the obsolete pages.
> + * for_each_gfn_valid_sp() has skipped that kind of pages.
> */
> -#define for_each_gfn_sp(_kvm, _sp, _gfn) \
> +#define for_each_gfn_valid_sp(_kvm, _sp, _gfn) \
> hlist_for_each_entry(_sp, \
> &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
> - if ((_sp)->gfn != (_gfn)) {} else
> + if ((_sp)->gfn != (_gfn) || is_obsolete_sp((_kvm), (_sp)) \
> + || (_sp)->role.invalid) {} else
>
> #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
> - for_each_gfn_sp(_kvm, _sp, _gfn) \
> - if ((_sp)->role.direct || (_sp)->role.invalid) {} else
> + for_each_gfn_valid_sp(_kvm, _sp, _gfn) \
> + if ((_sp)->role.direct) {} else
>
> /* @sp->gfn should be write-protected at the call site */
> static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> @@ -1958,6 +1957,11 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
> static void mmu_audit_disable(void) { }
> #endif
>
> +static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> +{
> + return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
> +}
> +
> static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> struct list_head *invalid_list)
> {
> @@ -2092,11 +2096,6 @@ static void clear_sp_write_flooding_count(u64 *spte)
> __clear_sp_write_flooding_count(sp);
> }
>
> -static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> -{
> - return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
> -}
> -
> static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> gfn_t gfn,
> gva_t gaddr,
> @@ -2123,10 +2122,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
> quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
> role.quadrant = quadrant;
> }
> - for_each_gfn_sp(vcpu->kvm, sp, gfn) {
> - if (is_obsolete_sp(vcpu->kvm, sp))
> - continue;
> -
> + for_each_gfn_valid_sp(vcpu->kvm, sp, gfn) {
> if (!need_sync && sp->unsync)
> need_sync = true;
>
>

2016-03-29 17:44:45

by Xiao Guangrong

Subject: Re: [PATCH 1/4] KVM: MMU: fix permission_fault()



On 03/25/2016 10:21 PM, Paolo Bonzini wrote:
>
>
> On 25/03/2016 14:19, Xiao Guangrong wrote:
>> WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK));
>> - pfec |= PFERR_PRESENT_MASK;
>> + errcode = PFERR_PRESENT_MASK;
>>
>> if (unlikely(mmu->pkru_mask)) {
>> u32 pkru_bits, offset;
>> @@ -193,11 +193,11 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>> ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));
>>
>> pkru_bits &= mmu->pkru_mask >> offset;
>> - pfec |= -pkru_bits & PFERR_PK_MASK;
>> + errcode |= -pkru_bits & PFERR_PK_MASK;
>> fault |= (pkru_bits != 0);
>> }
>>
>> - return -(uint32_t)fault & pfec;
>> + return -(uint32_t)fault & errcode;
>> }
>
> I have another doubt here.
>
> If you get a fault due to U=0, you would not get PFERR_PK_MASK. This
> is checked implicitly through the pte_user bit which we moved to
> PFERR_RSVD_BIT. However, if you get a fault due to W=0 _and_
> PKRU.AD=1 or PKRU.WD=1 for the page's protection key, would the PK
> bit be set in the error code? If not, we would need something like
> this:

Based on the SDM:
PK flag (bit 5).
This flag is 1 if (1) IA32_EFER.LMA = CR4.PKE = 1; (2) the access causing the page-fault exception
was a data access; (3) the linear address was a user-mode address with protection key i; and (4) the
PKRU register (see Section 4.6.2) is such that either (a) ADi = 1; or (b) the following all hold:
(i) WDi = 1; (ii) the access is a write access; and (iii) either CR0.WP = 1 or the access causing
the page-fault exception was a user-mode access.

So I think the PKEY check and the ordinary check are independent, i.e.,
PFEC.PKEY may be set even if the permission in the page table is not
adequate.
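
Read as a predicate, the quoted conditions come out roughly as below (a
sketch only, with the architectural state flattened into plain booleans):

#include <stdbool.h>

/*
 * Nothing here depends on whether the ordinary R/W/X permission check
 * passed or failed: the protection-key test stands on its own.
 */
static bool pfec_pk_set(bool lma, bool pke, bool data_access,
			bool user_mode_addr, bool pkru_ad, bool pkru_wd,
			bool write_access, bool cr0_wp, bool user_access)
{
	if (!(lma && pke && data_access && user_mode_addr))
		return false;
	return pkru_ad ||
	       (pkru_wd && write_access && (cr0_wp || user_access));
}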

2016-03-29 20:12:54

by Paolo Bonzini

Subject: Re: [PATCH 1/4] KVM: MMU: fix permission_fault()



On 29/03/2016 19:43, Xiao Guangrong wrote:
> Based on the SDM:
> PK flag (bit 5).
> This flag is 1 if (1) IA32_EFER.LMA = CR4.PKE = 1; (2) the access
> causing the page-fault exception was a data access; (3) the linear
> address was a user-mode address with protection key i; and (4) the PKRU
> register (see Section 4.6.2) is such that either (a) ADi = 1; or (b) the
> following all hold: (i) WDi = 1; (ii) the access is a write access; and
> (iii) either CR0.WP = 1 or the access causing the page-fault exception
> was a user-mode access.
>
> So I think the PKEY check and the ordinary check are independent, i.e.,
> PFEC.PKEY may be set even if the permission in the page table is not
> adequate.

x86/access.flat is currently using the "other" definition, i.e., PFEC.PK
is only set if W=1 or CR0.WP=0 && PFEC.U=0 or PFEC.W=0. Can you use it
(with ept=1 of course) to check what the processor is doing?

Paolo

2016-03-30 01:57:37

by Xiao Guangrong

Subject: Re: [PATCH 1/4] KVM: MMU: fix permission_fault()



On 03/30/2016 04:09 AM, Paolo Bonzini wrote:
>
>
> On 29/03/2016 19:43, Xiao Guangrong wrote:
>> Based on the SDM:
>> PK flag (bit 5).
>> This flag is 1 if (1) IA32_EFER.LMA = CR4.PKE = 1; (2) the access
>> causing the page-fault exception was a data access; (3) the linear
>> address was a user-mode address with protection key i; and (4) the PKRU
>> register (see Section 4.6.2) is such that either (a) ADi = 1; or (b) the
>> following all hold: (i) WDi = 1; (ii) the access is a write access; and
>> (iii) either CR0.WP = 1 or the access causing the page-fault exception
>> was a user-mode access.
>>
>> So I think the PKEY check and the ordinary check are independent, i.e.,
>> PFEC.PKEY may be set even if the permission in the page table is not
>> adequate.
>
> x86/access.flat is currently using the "other" definition, i.e., PFEC.PK
> is only set if W=1 or CR0.WP=0 && PFEC.U=0 or PFEC.W=0. Can you use it
> (with ept=1 of course) to check what the processor is doing?
>

Sure.

And with ept=1 it is hard to trigger the MMU issue; I am enabling PKEY on
the shadow MMU, let's see what will happen. ;)

2016-03-30 06:36:22

by Paolo Bonzini

Subject: Re: [PATCH 1/4] KVM: MMU: fix permission_fault()



On 30/03/2016 03:56, Xiao Guangrong wrote:
>> x86/access.flat is currently using the "other" definition, i.e., PFEC.PK
>> is only set if W=1 or CR0.WP=0 && PFEC.U=0 or PFEC.W=0. Can you use it
>> (with ept=1 of course) to check what the processor is doing?
>
> Sure.
>
> And with ept=1 it is hard to trigger the MMU issue; I am enabling PKEY on
> the shadow MMU, let's see what will happen. ;)

No, don't do that!

ept=1 lets you test what the processor does. It means you cannot test
permission_fault(), but what we want here is just reverse engineering
the microcode. ept=1 lets you do exactly that.

Paolo

2016-03-30 06:40:07

by Xiao Guangrong

Subject: Re: [PATCH 1/4] KVM: MMU: fix permission_fault()



On 03/30/2016 02:36 PM, Paolo Bonzini wrote:
>
>
> On 30/03/2016 03:56, Xiao Guangrong wrote:
>>> x86/access.flat is currently using the "other" definition, i.e., PFEC.PK
>>> is only set if W=1 or CR0.WP=0 && PFEC.U=0 or PFEC.W=0. Can you use it
>>> (with ept=1 of course) to check what the processor is doing?
>>
>> Sure.
>>
>> And with ept=1 it is hard to trigger the MMU issue; I am enabling PKEY
>> on the shadow MMU, let's see what will happen. ;)
>
> No, don't do that!
>
> ept=1 lets you test what the processor does. It means you cannot test
> permission_fault(), but what we want here is just reverse engineering
> the microcode. ept=1 lets you do exactly that.

Yes, I got this point. Huaitong will do the test once the machine is free.

2016-04-06 03:29:07

by Xiao Guangrong

Subject: Re: [PATCH 1/4] KVM: MMU: fix permission_fault()



On 03/30/2016 02:39 PM, Xiao Guangrong wrote:
>
>
> On 03/30/2016 02:36 PM, Paolo Bonzini wrote:
>>
>>
>> On 30/03/2016 03:56, Xiao Guangrong wrote:
>>>> x86/access.flat is currently using the "other" definition, i.e., PFEC.PK
>>>> is only set if W=1 or CR0.WP=0 && PFEC.U=0 or PFEC.W=0. Can you use it
>>>> (with ept=1 of course) to check what the processor is doing?
>>>
>>> Sure.
>>>
>>> And with ept=1 it is hard to trigger the MMU issue; I am enabling PKEY
>>> on the shadow MMU, let's see what will happen. ;)
>>
>> No, don't do that!
>>
>> ept=1 lets you test what the processor does. It means you cannot test
>> permission_fault(), but what we want here is just reverse engineering
>> the microcode. ept=1 lets you do exactly that.
>
> Yes, I got this point. Huaitong will do the test once the machine is
> free.

I tested it and it failed:

test pte.p pte.user pde.p pde.user pde.a pde.pse pkru.wd pkey=1 user write efer.nx cr4.pke: FAIL:
error code 27 expected 7
Dump mapping: address: 0x123400000000
------L4: 2ebe007
------L3: 2ebf007
------L2: 8000000020000a5

So PFEC.PKEY is set even if the ordinary check fails (here caused by
pde.rw = 0); the KVM code is right. :)

2016-04-06 08:17:47

by Paolo Bonzini

Subject: Re: [PATCH 1/4] KVM: MMU: fix permission_fault()



On 06/04/2016 05:27, Xiao Guangrong wrote:
>
> I tested it and it failed:
>
> test pte.p pte.user pde.p pde.user pde.a pde.pse pkru.wd pkey=1 user
> write efer.nx cr4.pke: FAIL: error code 27 expected 7
> Dump mapping: address: 0x123400000000
> ------L4: 2ebe007
> ------L3: 2ebf007
> ------L2: 8000000020000a5
>
> So PFEC.PKEY is set even if the ordinary check fails (here caused by
> pde.rw = 0); the KVM code is right. :)

Cool, thanks very much. I'll fix both QEMU and kvm-unit-tests.

Paolo

2016-04-06 08:56:55

by Paolo Bonzini

Subject: Re: [PATCH 1/4] KVM: MMU: fix permission_fault()



On 25/03/2016 14:19, Xiao Guangrong wrote:
> @@ -193,11 +193,11 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
> ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));

One more tweak is needed in the line above; pfec - 1 must become pfec &
~1, because you've removed the

pfec |= PFERR_PRESENT_MASK;

line. Applied with this change.
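
The reasoning, spelled out: before this patch, pfec had PFERR_PRESENT_MASK
(bit 0) ORed in by that point, so "pfec - 1" simply cleared a bit known to
be set; with pfec now left untouched, bit 0 may be clear and the
subtraction would borrow into the higher bits. Hence (a sketch of the
tweak; line breaks approximate):

- offset = pfec - 1 +
+ offset = (pfec & ~1) +
	((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));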

Paolo

>
> pkru_bits &= mmu->pkru_mask >> offset;
> - pfec |= -pkru_bits & PFERR_PK_MASK;
> + errcode |= -pkru_bits & PFERR_PK_MASK;
> fault |= (pkru_bits != 0);
> }
>
> - return -(uint32_t)fault & pfec;
> + return -(uint32_t)fault & errcode;
> }
>
> void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);

2016-04-06 15:11:53

by Xiao Guangrong

Subject: Re: [PATCH 1/4] KVM: MMU: fix permission_fault()



On 04/06/2016 04:56 PM, Paolo Bonzini wrote:
>
>
> On 25/03/2016 14:19, Xiao Guangrong wrote:
>> @@ -193,11 +193,11 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
>> ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));
>
> One more tweak is needed in the line above; pfec - 1 must become pfec &
> ~1, because you've removed the
>
> pfec |= PFERR_PRESENT_MASK;
>
> line. Applied with this change.

Yes, indeed. Thanks for your fix.