2022-12-22 02:38:20

by Vipin Sharma

[permalink] [raw]
Subject: [Patch v3 5/9] KVM: x86/mmu: Allocate TDP page table's page on correct NUMA node on split

When dirty log is enabled, huge pages are split. Page table's pages
during the split are allocated based on the current thread NUMA node or
mempolicy. This causes inefficient page table accesses if underlying
page is on a different NUMA node

Allocate page table's pages on the same NUMA node as the underlying huge
page when dirty log is enabled and huge pages are split.

The performance gain during the pre-copy phase of live migrations of a
416 vCPUs and 11 TiB memory VM on a 8 node host was seen in the range
of 130% to 150%.

Suggested-by: David Matlack <[email protected]>
Signed-off-by: Vipin Sharma <[email protected]>
---
arch/x86/kvm/mmu/tdp_mmu.c | 12 ++++++++----
include/linux/kvm_host.h | 18 ++++++++++++++++++
2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 4974fa96deff..376b8dceb3f9 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1403,7 +1403,7 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
return spte_set;
}

-static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
+static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(int nid, gfp_t gfp)
{
struct kvm_mmu_page *sp;

@@ -1413,7 +1413,8 @@ static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
if (!sp)
return NULL;

- sp->spt = (void *)__get_free_page(gfp);
+ sp->spt = kvm_mmu_get_free_page(nid, gfp);
+
if (!sp->spt) {
kmem_cache_free(mmu_page_header_cache, sp);
return NULL;
@@ -1427,6 +1428,9 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
bool shared)
{
struct kvm_mmu_page *sp;
+ int nid;
+
+ nid = kvm_pfn_to_page_table_nid(spte_to_pfn(iter->old_spte));

/*
* Since we are allocating while under the MMU lock we have to be
@@ -1437,7 +1441,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
* If this allocation fails we drop the lock and retry with reclaim
* allowed.
*/
- sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
+ sp = __tdp_mmu_alloc_sp_for_split(nid, GFP_NOWAIT | __GFP_ACCOUNT);
if (sp)
return sp;

@@ -1449,7 +1453,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
write_unlock(&kvm->mmu_lock);

iter->yielded = true;
- sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
+ sp = __tdp_mmu_alloc_sp_for_split(nid, GFP_KERNEL_ACCOUNT);

if (shared)
read_lock(&kvm->mmu_lock);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d48064503b88..a262e15ebd19 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1583,6 +1583,24 @@ void kvm_arch_sync_events(struct kvm *kvm);
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);

struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn);
+
+/*
+ * Tells the appropriate NUMA node location of the page table's page based on
+ * pfn it will point to.
+ *
+ * Return the nid of the page if pfn is valid and backed by a refcounted page,
+ * otherwise, return the nearest memory node for the current CPU.
+ */
+static inline int kvm_pfn_to_page_table_nid(kvm_pfn_t pfn)
+{
+ struct page *page = kvm_pfn_to_refcounted_page(pfn);
+
+ if (page)
+ return page_to_nid(page);
+ else
+ return numa_mem_id();
+}
+
bool kvm_is_zone_device_page(struct page *page);

struct kvm_irq_ack_notifier {
--
2.39.0.314.g84b9a713c41-goog


2022-12-27 19:16:58

by Ben Gardon

[permalink] [raw]
Subject: Re: [Patch v3 5/9] KVM: x86/mmu: Allocate TDP page table's page on correct NUMA node on split

On Wed, Dec 21, 2022 at 6:35 PM Vipin Sharma <[email protected]> wrote:
>
> When dirty log is enabled, huge pages are split. Page table's pages

Nit: Suggest "When huge pages are split for dirty log" since this can
happen at various points during dirty logging.
Same below.

> during the split are allocated based on the current thread NUMA node or
> mempolicy. This causes inefficient page table accesses if underlying
> page is on a different NUMA node
>
> Allocate page table's pages on the same NUMA node as the underlying huge
> page when dirty log is enabled and huge pages are split.
>
> The performance gain during the pre-copy phase of live migrations of a
> 416 vCPUs and 11 TiB memory VM on a 8 node host was seen in the range
> of 130% to 150%.
>
> Suggested-by: David Matlack <[email protected]>
> Signed-off-by: Vipin Sharma <[email protected]>
> ---
> arch/x86/kvm/mmu/tdp_mmu.c | 12 ++++++++----
> include/linux/kvm_host.h | 18 ++++++++++++++++++
> 2 files changed, 26 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 4974fa96deff..376b8dceb3f9 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1403,7 +1403,7 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
> return spte_set;
> }
>
> -static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
> +static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(int nid, gfp_t gfp)
> {
> struct kvm_mmu_page *sp;
>
> @@ -1413,7 +1413,8 @@ static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
> if (!sp)
> return NULL;
>
> - sp->spt = (void *)__get_free_page(gfp);
> + sp->spt = kvm_mmu_get_free_page(nid, gfp);
> +

Just so that kvm_mmu_get_free_page isn't dead code in the previous
commit, I'd do this refactor there and just pass NUMA_NO_NODE here.

> if (!sp->spt) {
> kmem_cache_free(mmu_page_header_cache, sp);
> return NULL;
> @@ -1427,6 +1428,9 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
> bool shared)
> {
> struct kvm_mmu_page *sp;
> + int nid;
> +
> + nid = kvm_pfn_to_page_table_nid(spte_to_pfn(iter->old_spte));
>
> /*
> * Since we are allocating while under the MMU lock we have to be
> @@ -1437,7 +1441,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
> * If this allocation fails we drop the lock and retry with reclaim
> * allowed.
> */
> - sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
> + sp = __tdp_mmu_alloc_sp_for_split(nid, GFP_NOWAIT | __GFP_ACCOUNT);
> if (sp)
> return sp;
>
> @@ -1449,7 +1453,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
> write_unlock(&kvm->mmu_lock);
>
> iter->yielded = true;
> - sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
> + sp = __tdp_mmu_alloc_sp_for_split(nid, GFP_KERNEL_ACCOUNT);
>
> if (shared)
> read_lock(&kvm->mmu_lock);
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index d48064503b88..a262e15ebd19 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1583,6 +1583,24 @@ void kvm_arch_sync_events(struct kvm *kvm);
> int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
>
> struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn);
> +
> +/*
> + * Tells the appropriate NUMA node location of the page table's page based on
> + * pfn it will point to.
> + *
> + * Return the nid of the page if pfn is valid and backed by a refcounted page,
> + * otherwise, return the nearest memory node for the current CPU.

Nit: Should this be "current thread"?

> + */
> +static inline int kvm_pfn_to_page_table_nid(kvm_pfn_t pfn)

This could just be kvm_pfn_nid (or even better kvm_pfn_node_id) since
this really has nothing to do with page tables. We just want to know
which NUMA node backs the given PFN.

> +{
> + struct page *page = kvm_pfn_to_refcounted_page(pfn);
> +
> + if (page)
> + return page_to_nid(page);
> + else
> + return numa_mem_id();
> +}
> +
> bool kvm_is_zone_device_page(struct page *page);
>
> struct kvm_irq_ack_notifier {
> --
> 2.39.0.314.g84b9a713c41-goog
>

2022-12-28 22:21:57

by Vipin Sharma

[permalink] [raw]
Subject: Re: [Patch v3 5/9] KVM: x86/mmu: Allocate TDP page table's page on correct NUMA node on split

On Tue, Dec 27, 2022 at 11:02 AM Ben Gardon <[email protected]> wrote:
>
> On Wed, Dec 21, 2022 at 6:35 PM Vipin Sharma <[email protected]> wrote:
> >
> > When dirty log is enabled, huge pages are split. Page table's pages
>
> Nit: Suggest "When huge pages are split for dirty log" since this can
> happen at various points during dirty logging.
> Same below.
>

Yeah, this should be updated.

> > during the split are allocated based on the current thread NUMA node or
> > mempolicy. This causes inefficient page table accesses if underlying
> > page is on a different NUMA node
> >
> > Allocate page table's pages on the same NUMA node as the underlying huge
> > page when dirty log is enabled and huge pages are split.
> >
> > The performance gain during the pre-copy phase of live migrations of a
> > 416 vCPUs and 11 TiB memory VM on a 8 node host was seen in the range
> > of 130% to 150%.
> >
> > Suggested-by: David Matlack <[email protected]>
> > Signed-off-by: Vipin Sharma <[email protected]>
> > ---
> > arch/x86/kvm/mmu/tdp_mmu.c | 12 ++++++++----
> > include/linux/kvm_host.h | 18 ++++++++++++++++++
> > 2 files changed, 26 insertions(+), 4 deletions(-)
> >
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index 4974fa96deff..376b8dceb3f9 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -1403,7 +1403,7 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
> > return spte_set;
> > }
> >
> > -static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
> > +static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(int nid, gfp_t gfp)
> > {
> > struct kvm_mmu_page *sp;
> >
> > @@ -1413,7 +1413,8 @@ static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
> > if (!sp)
> > return NULL;
> >
> > - sp->spt = (void *)__get_free_page(gfp);
> > + sp->spt = kvm_mmu_get_free_page(nid, gfp);
> > +
>
> Just so that kvm_mmu_get_free_page isn't dead code in the previous
> commit, I'd do this refactor there and just pass NUMA_NO_NODE here.
>

Agreed.

> > if (!sp->spt) {
> > kmem_cache_free(mmu_page_header_cache, sp);
> > return NULL;
> > @@ -1427,6 +1428,9 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
> > bool shared)
> > {
> > struct kvm_mmu_page *sp;
> > + int nid;
> > +
> > + nid = kvm_pfn_to_page_table_nid(spte_to_pfn(iter->old_spte));
> >
> > /*
> > * Since we are allocating while under the MMU lock we have to be
> > @@ -1437,7 +1441,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
> > * If this allocation fails we drop the lock and retry with reclaim
> > * allowed.
> > */
> > - sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
> > + sp = __tdp_mmu_alloc_sp_for_split(nid, GFP_NOWAIT | __GFP_ACCOUNT);
> > if (sp)
> > return sp;
> >
> > @@ -1449,7 +1453,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
> > write_unlock(&kvm->mmu_lock);
> >
> > iter->yielded = true;
> > - sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
> > + sp = __tdp_mmu_alloc_sp_for_split(nid, GFP_KERNEL_ACCOUNT);
> >
> > if (shared)
> > read_lock(&kvm->mmu_lock);
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index d48064503b88..a262e15ebd19 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -1583,6 +1583,24 @@ void kvm_arch_sync_events(struct kvm *kvm);
> > int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
> >
> > struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn);
> > +
> > +/*
> > + * Tells the appropriate NUMA node location of the page table's page based on
> > + * pfn it will point to.
> > + *
> > + * Return the nid of the page if pfn is valid and backed by a refcounted page,
> > + * otherwise, return the nearest memory node for the current CPU.
>
> Nit: Should this be "current thread"?

I will say "current thread CPU". As memory nodes are near to CPUs
whereas threads can execute on multiple CPUs throughout its lifetime.

>
> > + */
> > +static inline int kvm_pfn_to_page_table_nid(kvm_pfn_t pfn)
>
> This could just be kvm_pfn_nid (or even better kvm_pfn_node_id) since
> this really has nothing to do with page tables. We just want to know
> which NUMA node backs the given PFN.

Apart from NUMA node backing the given PFN, it can also return the
nearest NUMA node via numa_mem_id(). So, it is actually telling which
NUMA node is the best one for the page table's page, given a PFN.


>
> > +{
> > + struct page *page = kvm_pfn_to_refcounted_page(pfn);
> > +
> > + if (page)
> > + return page_to_nid(page);
> > + else
> > + return numa_mem_id();
> > +}
> > +
> > bool kvm_is_zone_device_page(struct page *page);
> >
> > struct kvm_irq_ack_notifier {
> > --
> > 2.39.0.314.g84b9a713c41-goog
> >

2022-12-29 22:50:49

by David Matlack

[permalink] [raw]
Subject: Re: [Patch v3 5/9] KVM: x86/mmu: Allocate TDP page table's page on correct NUMA node on split

On Wed, Dec 21, 2022 at 06:34:53PM -0800, Vipin Sharma wrote:
> When dirty log is enabled, huge pages are split. Page table's pages
> during the split are allocated based on the current thread NUMA node or
> mempolicy. This causes inefficient page table accesses if underlying
> page is on a different NUMA node
>
> Allocate page table's pages on the same NUMA node as the underlying huge
> page when dirty log is enabled and huge pages are split.
>
> The performance gain during the pre-copy phase of live migrations of a
> 416 vCPUs and 11 TiB memory VM on a 8 node host was seen in the range
> of 130% to 150%.

Can you be more specific about this. "The performance" is vague. I know
it's an internal workload and fully explaining it would be difficult,
but you can give readers a slightly more specific idea of what improved.
e.g.

When testing with a synthetic write-heavy workload in a 416 vCPU VM on
an 8 NUMA node host, the throughput increased by 150% from X to Y
operations per second.

It's also necessary to characterize the improvement relative to the
performance when dirty logging is not enabled. Whithout that information
it would be hard for an unfamiliar reader to understand how useful this
change really is.

For example, let's say the throughput of your workload is 100,000
operations per second before dirty logging is enabled, and that drops
down to 1,000 operations per second after dirty logging is enabled. This
commit could increase that by 150% to 2,500 operations per second, but
that's actually not a very meaningful improvement since, either way,
guest performance is degraded by 95+% during dirty logging.

On the other hand, if performance goes from 100,000 to 30,000 normally,
and this commit increases that 30,000 to 75,000 (150%), that's a much
more meaningful improvement.

>
> Suggested-by: David Matlack <[email protected]>
> Signed-off-by: Vipin Sharma <[email protected]>
> ---
> arch/x86/kvm/mmu/tdp_mmu.c | 12 ++++++++----
> include/linux/kvm_host.h | 18 ++++++++++++++++++
> 2 files changed, 26 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 4974fa96deff..376b8dceb3f9 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1403,7 +1403,7 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
> return spte_set;
> }
>
> -static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
> +static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(int nid, gfp_t gfp)
> {
> struct kvm_mmu_page *sp;
>
> @@ -1413,7 +1413,8 @@ static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
> if (!sp)
> return NULL;
>
> - sp->spt = (void *)__get_free_page(gfp);
> + sp->spt = kvm_mmu_get_free_page(nid, gfp);
> +
> if (!sp->spt) {
> kmem_cache_free(mmu_page_header_cache, sp);
> return NULL;
> @@ -1427,6 +1428,9 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
> bool shared)
> {
> struct kvm_mmu_page *sp;
> + int nid;
> +
> + nid = kvm_pfn_to_page_table_nid(spte_to_pfn(iter->old_spte));
>
> /*
> * Since we are allocating while under the MMU lock we have to be
> @@ -1437,7 +1441,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
> * If this allocation fails we drop the lock and retry with reclaim
> * allowed.
> */
> - sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
> + sp = __tdp_mmu_alloc_sp_for_split(nid, GFP_NOWAIT | __GFP_ACCOUNT);
> if (sp)
> return sp;
>
> @@ -1449,7 +1453,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
> write_unlock(&kvm->mmu_lock);
>
> iter->yielded = true;
> - sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
> + sp = __tdp_mmu_alloc_sp_for_split(nid, GFP_KERNEL_ACCOUNT);
>
> if (shared)
> read_lock(&kvm->mmu_lock);
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index d48064503b88..a262e15ebd19 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1583,6 +1583,24 @@ void kvm_arch_sync_events(struct kvm *kvm);
> int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
>
> struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn);
> +
> +/*
> + * Tells the appropriate NUMA node location of the page table's page based on
> + * pfn it will point to.

I know what you are trying to say but the wording is a bit awkward. e.g.
"Tells" instead of "Returns", "location" is redundant, "page table's
page", etc. Suggest this:

/*
* Returns an appropriate NUMA node on which to allocate a page table that
* maps @pfn.
*/

> + *
> + * Return the nid of the page if pfn is valid and backed by a refcounted page,
> + * otherwise, return the nearest memory node for the current CPU.

I would just drop this as it's just restating the code, which is already
very readable.

> + */
> +static inline int kvm_pfn_to_page_table_nid(kvm_pfn_t pfn)
> +{
> + struct page *page = kvm_pfn_to_refcounted_page(pfn);
> +
> + if (page)
> + return page_to_nid(page);
> + else
> + return numa_mem_id();
> +}
> +
> bool kvm_is_zone_device_page(struct page *page);
>
> struct kvm_irq_ack_notifier {
> --
> 2.39.0.314.g84b9a713c41-goog
>

2023-01-03 18:41:56

by Vipin Sharma

[permalink] [raw]
Subject: Re: [Patch v3 5/9] KVM: x86/mmu: Allocate TDP page table's page on correct NUMA node on split

On Thu, Dec 29, 2022 at 2:30 PM David Matlack <[email protected]> wrote:
>
> On Wed, Dec 21, 2022 at 06:34:53PM -0800, Vipin Sharma wrote:
> > When dirty log is enabled, huge pages are split. Page table's pages
> > during the split are allocated based on the current thread NUMA node or
> > mempolicy. This causes inefficient page table accesses if underlying
> > page is on a different NUMA node
> >
> > Allocate page table's pages on the same NUMA node as the underlying huge
> > page when dirty log is enabled and huge pages are split.
> >
> > The performance gain during the pre-copy phase of live migrations of a
> > 416 vCPUs and 11 TiB memory VM on a 8 node host was seen in the range
> > of 130% to 150%.
>
> Can you be more specific about this. "The performance" is vague. I know
> it's an internal workload and fully explaining it would be difficult,
> but you can give readers a slightly more specific idea of what improved.
> e.g.
>
> When testing with a synthetic write-heavy workload in a 416 vCPU VM on
> an 8 NUMA node host, the throughput increased by 150% from X to Y
> operations per second.
>
> It's also necessary to characterize the improvement relative to the
> performance when dirty logging is not enabled. Whithout that information
> it would be hard for an unfamiliar reader to understand how useful this
> change really is.
>
> For example, let's say the throughput of your workload is 100,000
> operations per second before dirty logging is enabled, and that drops
> down to 1,000 operations per second after dirty logging is enabled. This
> commit could increase that by 150% to 2,500 operations per second, but
> that's actually not a very meaningful improvement since, either way,
> guest performance is degraded by 95+% during dirty logging.
>
> On the other hand, if performance goes from 100,000 to 30,000 normally,
> and this commit increases that 30,000 to 75,000 (150%), that's a much
> more meaningful improvement.
>

Yeah, I will provide more insight in the next version.

> >
> > Suggested-by: David Matlack <[email protected]>
> > Signed-off-by: Vipin Sharma <[email protected]>
> > ---
> > arch/x86/kvm/mmu/tdp_mmu.c | 12 ++++++++----
> > include/linux/kvm_host.h | 18 ++++++++++++++++++
> > 2 files changed, 26 insertions(+), 4 deletions(-)
> >
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index 4974fa96deff..376b8dceb3f9 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -1403,7 +1403,7 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
> > return spte_set;
> > }
> >
> > -static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
> > +static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(int nid, gfp_t gfp)
> > {
> > struct kvm_mmu_page *sp;
> >
> > @@ -1413,7 +1413,8 @@ static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
> > if (!sp)
> > return NULL;
> >
> > - sp->spt = (void *)__get_free_page(gfp);
> > + sp->spt = kvm_mmu_get_free_page(nid, gfp);
> > +
> > if (!sp->spt) {
> > kmem_cache_free(mmu_page_header_cache, sp);
> > return NULL;
> > @@ -1427,6 +1428,9 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
> > bool shared)
> > {
> > struct kvm_mmu_page *sp;
> > + int nid;
> > +
> > + nid = kvm_pfn_to_page_table_nid(spte_to_pfn(iter->old_spte));
> >
> > /*
> > * Since we are allocating while under the MMU lock we have to be
> > @@ -1437,7 +1441,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
> > * If this allocation fails we drop the lock and retry with reclaim
> > * allowed.
> > */
> > - sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
> > + sp = __tdp_mmu_alloc_sp_for_split(nid, GFP_NOWAIT | __GFP_ACCOUNT);
> > if (sp)
> > return sp;
> >
> > @@ -1449,7 +1453,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
> > write_unlock(&kvm->mmu_lock);
> >
> > iter->yielded = true;
> > - sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
> > + sp = __tdp_mmu_alloc_sp_for_split(nid, GFP_KERNEL_ACCOUNT);
> >
> > if (shared)
> > read_lock(&kvm->mmu_lock);
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index d48064503b88..a262e15ebd19 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -1583,6 +1583,24 @@ void kvm_arch_sync_events(struct kvm *kvm);
> > int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
> >
> > struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn);
> > +
> > +/*
> > + * Tells the appropriate NUMA node location of the page table's page based on
> > + * pfn it will point to.
>
> I know what you are trying to say but the wording is a bit awkward. e.g.
> "Tells" instead of "Returns", "location" is redundant, "page table's
> page", etc. Suggest this:
>
> /*
> * Returns an appropriate NUMA node on which to allocate a page table that
> * maps @pfn.
> */
>
> > + *
> > + * Return the nid of the page if pfn is valid and backed by a refcounted page,
> > + * otherwise, return the nearest memory node for the current CPU.
>
> I would just drop this as it's just restating the code, which is already
> very readable.
>

Okay.

> > + */
> > +static inline int kvm_pfn_to_page_table_nid(kvm_pfn_t pfn)
> > +{
> > + struct page *page = kvm_pfn_to_refcounted_page(pfn);
> > +
> > + if (page)
> > + return page_to_nid(page);
> > + else
> > + return numa_mem_id();
> > +}
> > +
> > bool kvm_is_zone_device_page(struct page *page);
> >
> > struct kvm_irq_ack_notifier {
> > --
> > 2.39.0.314.g84b9a713c41-goog
> >