Date: Wed, 8 Mar 2023 22:33:31 +0200
From: Zhi Wang
To: Vipin Sharma
Cc: seanjc@google.com, pbonzini@redhat.com, bgardon@google.com,
    dmatlack@google.com, jmattson@google.com, mizhang@google.com,
    kvm@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: Re: [Patch v4 03/18] KVM: x86/mmu: Track count of pages in KVM MMU page caches globally
Message-ID: <20230308223331.00000234@gmail.com>
In-Reply-To: <20230306224127.1689967-4-vipinsh@google.com>
References: <20230306224127.1689967-1-vipinsh@google.com>
 <20230306224127.1689967-4-vipinsh@google.com>

On Mon, 6 Mar 2023 14:41:12 -0800 Vipin Sharma wrote:

> Create a global counter for total number of pages available
> in MMU page caches across all VMs. Add mmu_shadow_page_cache
> pages to this counter.
>
> This accounting will be used in future commits to shrink MMU caches via
> KVM MMU shrinker.
>
> Signed-off-by: Vipin Sharma
> ---
>  arch/x86/include/asm/kvm_host.h |  5 ++
>  arch/x86/kvm/mmu/mmu.c          | 90 ++++++++++++++++++++++++++++-----
>  arch/x86/kvm/mmu/mmu_internal.h |  2 +
>  arch/x86/kvm/mmu/paging_tmpl.h  | 25 +++++----
>  arch/x86/kvm/mmu/tdp_mmu.c      |  3 +-
>  5 files changed, 100 insertions(+), 25 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index ebbe692acf3f..4322c7020d5d 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -791,6 +791,11 @@ struct kvm_vcpu_arch {
>          struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
>          struct kvm_mmu_memory_cache mmu_page_header_cache;
>
> +        /*
> +         * Protect allocation and release of pages from mmu_shadow_page_cache.
> +         */
> +        struct mutex mmu_shadow_page_cache_lock;
> +
>          /*
>           * QEMU userspace and the guest each have their own FPU state.
>           * In vcpu_run, we switch between the user and guest FPU contexts.
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 3a452989f5cd..13f41b7ac280 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -167,6 +167,11 @@ struct kvm_shadow_walk_iterator {
>  static struct kmem_cache *pte_list_desc_cache;
>  struct kmem_cache *mmu_page_header_cache;
>
> +/*
> + * Global count of unused pages in MMU page caches across all VMs.
> + */
> +static struct percpu_counter kvm_total_unused_cached_pages;
> +
>  static void mmu_spte_set(u64 *sptep, u64 spte);
>
>  struct kvm_mmu_role_regs {
> @@ -667,6 +672,34 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
>          }
>  }
>
> +/**
> + * Caller should hold mutex lock corresponding to cache, if available.
> + */
> +static int mmu_topup_sp_memory_cache(struct kvm_mmu_memory_cache *cache,
> +                                     int min)
> +{
> +        int orig_nobjs, r;
> +
> +        orig_nobjs = cache->nobjs;
> +        r = kvm_mmu_topup_memory_cache(cache, min);
> +        if (orig_nobjs != cache->nobjs)
> +                percpu_counter_add(&kvm_total_unused_cached_pages,
> +                                   (cache->nobjs - orig_nobjs));
> +
> +        return r;
> +}
> +

Maybe kvm_mmu_topup_shadow_page_cache() would be better?

As a user of kvm_mmu_topup_memory_cache(), mmu_topup_sp_memory_cache() is
not supposed to directly touch the kvm_mmu_memory_cache metadata. Also, the
name "mmu_topup_sp_memory_cache()" is easily confused with
"mmu_topup_memory_caches()"; renaming it would make the layer it belongs to
self-documenting.
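
E.g. something along these lines (just a sketch that reuses the body of the
helper above with the suggested name; the free side could become
kvm_mmu_free_shadow_page_cache() to match):

static int kvm_mmu_topup_shadow_page_cache(struct kvm_mmu_memory_cache *cache,
                                           int min)
{
        int orig_nobjs, r;

        /* Same logic as in this patch, only the function name changes. */
        orig_nobjs = cache->nobjs;
        r = kvm_mmu_topup_memory_cache(cache, min);
        if (orig_nobjs != cache->nobjs)
                percpu_counter_add(&kvm_total_unused_cached_pages,
                                   cache->nobjs - orig_nobjs);

        return r;
}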
The name "mmu_topup_sp_memory_cache()" seems similar with "mmu_topup_memory_cache()". Renaming it would make its level self-documenting. > +/** > + * Caller should hold mutex lock corresponding to kvm_mmu_memory_cache, if > + * available. > + */ > +static void mmu_free_sp_memory_cache(struct kvm_mmu_memory_cache *cache) > +{ > + if (cache->nobjs) > + percpu_counter_sub(&kvm_total_unused_cached_pages, cache->nobjs); > + kvm_mmu_free_memory_cache(cache); > +} > + > static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) > { > int r; > @@ -676,10 +709,11 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) > 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM); > if (r) > return r; > - r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache, > - PT64_ROOT_MAX_LEVEL); > + > + r = mmu_topup_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache, PT64_ROOT_MAX_LEVEL); > if (r) > return r; > + > if (maybe_indirect) { > r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache, > PT64_ROOT_MAX_LEVEL); > @@ -693,7 +727,9 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) > static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) > { > kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); > - kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache); > + mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock); > + mmu_free_sp_memory_cache(&vcpu->arch.mmu_shadow_page_cache); > + mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock); > kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache); > kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); > } > @@ -2148,6 +2184,7 @@ struct shadow_page_caches { > struct kvm_mmu_memory_cache *page_header_cache; > struct kvm_mmu_memory_cache *shadow_page_cache; > struct kvm_mmu_memory_cache *shadowed_info_cache; > + bool count_shadow_page_allocation; > }; > > static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, > @@ -2159,7 +2196,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, > struct kvm_mmu_page *sp; > > sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache); > - sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache); > + sp->spt = mmu_sp_memory_cache_alloc(caches->shadow_page_cache, > + caches->count_shadow_page_allocation); > if (!role.direct) > sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache); > > @@ -2216,6 +2254,7 @@ static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu, > .page_header_cache = &vcpu->arch.mmu_page_header_cache, > .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache, > .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache, > + .count_shadow_page_allocation = true, > }; > > return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role); > @@ -4314,29 +4353,32 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault > if (r != RET_PF_INVALID) > return r; > > + mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock); > r = mmu_topup_memory_caches(vcpu, false); > if (r) > - return r; > + goto out_page_cache_unlock; > > r = kvm_faultin_pfn(vcpu, fault, ACC_ALL); > if (r != RET_PF_CONTINUE) > - return r; > + goto out_page_cache_unlock; > > r = RET_PF_RETRY; > write_lock(&vcpu->kvm->mmu_lock); > > if (is_page_fault_stale(vcpu, fault)) > - goto out_unlock; > + goto out_mmu_unlock; > > r = make_mmu_pages_available(vcpu); > if (r) > - goto out_unlock; > + goto out_mmu_unlock; > > r = 
>
> -out_unlock:
> +out_mmu_unlock:
>          write_unlock(&vcpu->kvm->mmu_lock);
>          kvm_release_pfn_clean(fault->pfn);
> +out_page_cache_unlock:
> +        mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
>          return r;
>  }
>
> @@ -4396,25 +4438,28 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
>          if (r != RET_PF_INVALID)
>                  return r;
>
> +        mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);

Can you elaborate on why this lock is required? When will it contend?

1) Previously, mmu_topup_memory_caches() worked fine without a lock.

2) I suspect this lock affects the parallelization of TDP MMU fault
handling.

TDP MMU fault handling is designed to handle faults in parallel: it takes a
read lock and operates on the page tables with atomic operations, so
multiple fault handlers can be in the TDP MMU fault path at the same time
because of the read_lock(&vcpu->kvm->mmu_lock) below.

With this lock, it seems part of that parallelization benefit is lost,
because the lock can contend earlier, above. Will this cause a performance
regression?

If the lock will not contend here, then I am not sure we need it.
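
To make the question concrete, the ordering this patch introduces in the
TDP MMU fault path is roughly (condensed from this hunk, not a complete
function):

        mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);     /* new */
        r = mmu_topup_memory_caches(vcpu, false);
        ...
        read_lock(&vcpu->kvm->mmu_lock);                        /* existing */
        r = kvm_tdp_mmu_map(vcpu, fault);
        read_unlock(&vcpu->kvm->mmu_lock);
        ...
        mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);

Since the mutex is the per-vCPU one added to kvm_vcpu_arch in this patch,
faults on different vCPUs take different mutexes; presumably the contention
being guarded against is with whatever releases pages from the same vCPU's
cache (the shrinker mentioned in the commit message) rather than with
parallel faults, but it would be good to spell that out in the changelog.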

>          r = mmu_topup_memory_caches(vcpu, false);
>          if (r)
> -                return r;
> +                goto out_page_cache_unlock;
>
>          r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
>          if (r != RET_PF_CONTINUE)
> -                return r;
> +                goto out_page_cache_unlock;
>
>          r = RET_PF_RETRY;
>          read_lock(&vcpu->kvm->mmu_lock);
>
>          if (is_page_fault_stale(vcpu, fault))
> -                goto out_unlock;
> +                goto out_mmu_unlock;
>
>          r = kvm_tdp_mmu_map(vcpu, fault);
>
> -out_unlock:
> +out_mmu_unlock:
>          read_unlock(&vcpu->kvm->mmu_lock);
>          kvm_release_pfn_clean(fault->pfn);
> +out_page_cache_unlock:
> +        mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
>          return r;
>  }
>  #endif
> @@ -5394,6 +5439,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
>  {
>          int r;
>
> +        mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
>          r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
>          if (r)
>                  goto out;
> @@ -5420,6 +5466,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
>           */
>          static_call(kvm_x86_flush_tlb_current)(vcpu);
>  out:
> +        mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
>          return r;
>  }
>
> @@ -5924,6 +5971,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
>          vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
>
>          vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
> +        mutex_init(&vcpu->arch.mmu_shadow_page_cache_lock);
>
>          vcpu->arch.mmu = &vcpu->arch.root_mmu;
>          vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
> @@ -6769,12 +6817,17 @@ int kvm_mmu_vendor_module_init(void)
>          if (!mmu_page_header_cache)
>                  goto out;
>
> +        if (percpu_counter_init(&kvm_total_unused_cached_pages, 0, GFP_KERNEL))
> +                goto out;
> +
>          ret = register_shrinker(&mmu_shrinker, "x86-mmu");
>          if (ret)
> -                goto out;
> +                goto out_shrinker;
>
>          return 0;
>
> +out_shrinker:
> +        percpu_counter_destroy(&kvm_total_unused_cached_pages);
>  out:
>          mmu_destroy_caches();
>          return ret;
> @@ -6792,6 +6845,7 @@ void kvm_mmu_vendor_module_exit(void)
>  {
>          mmu_destroy_caches();
>          unregister_shrinker(&mmu_shrinker);
> +        percpu_counter_destroy(&kvm_total_unused_cached_pages);
>  }
>
>  /*
> @@ -6994,3 +7048,11 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
>          if (kvm->arch.nx_huge_page_recovery_thread)
>                  kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
>  }
> +
> +void *mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *shadow_page_cache,
> +                                bool count_allocation)
> +{
> +        if (count_allocation && shadow_page_cache->nobjs)
> +                percpu_counter_dec(&kvm_total_unused_cached_pages);
> +        return kvm_mmu_memory_cache_alloc(shadow_page_cache);
> +}
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index cc58631e2336..798cfbf0a36b 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -338,5 +338,7 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
>
>  void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
>  void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
> +void *mmu_sp_memory_cache_alloc(struct kvm_mmu_memory_cache *cache,
> +                                bool count_allocation);
>
>  #endif /* __KVM_X86_MMU_INTERNAL_H */
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index 57f0b75c80f9..1dea9be6849d 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -821,9 +821,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>                  return RET_PF_EMULATE;
>          }
>
> +        mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
>          r = mmu_topup_memory_caches(vcpu, true);
>          if (r)
> -                return r;
> +                goto out_page_cache_unlock;
>
>          vcpu->arch.write_fault_to_shadow_pgtable = false;
>
> @@ -837,7 +838,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>
>          r = kvm_faultin_pfn(vcpu, fault, walker.pte_access);
>          if (r != RET_PF_CONTINUE)
> -                return r;
> +                goto out_page_cache_unlock;
>
>          /*
>           * Do not change pte_access if the pfn is a mmio page, otherwise
> @@ -862,16 +863,18 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>          write_lock(&vcpu->kvm->mmu_lock);
>
>          if (is_page_fault_stale(vcpu, fault))
> -                goto out_unlock;
> +                goto out_mmu_unlock;
>
>          r = make_mmu_pages_available(vcpu);
>          if (r)
> -                goto out_unlock;
> +                goto out_mmu_unlock;
>          r = FNAME(fetch)(vcpu, fault, &walker);
>
> -out_unlock:
> +out_mmu_unlock:
>          write_unlock(&vcpu->kvm->mmu_lock);
>          kvm_release_pfn_clean(fault->pfn);
> +out_page_cache_unlock:
> +        mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
>          return r;
>  }
>
> @@ -897,17 +900,18 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
>
>          vcpu_clear_mmio_info(vcpu, gva);
>
> +        if (!VALID_PAGE(root_hpa)) {
> +                WARN_ON(1);
> +                return;
> +        }
> +
> +        mutex_lock(&vcpu->arch.mmu_shadow_page_cache_lock);
>          /*
>           * No need to check return value here, rmap_can_add() can
>           * help us to skip pte prefetch later.
>           */
>          mmu_topup_memory_caches(vcpu, true);
>
> -        if (!VALID_PAGE(root_hpa)) {
> -                WARN_ON(1);
> -                return;
> -        }
> -
>          write_lock(&vcpu->kvm->mmu_lock);
>          for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
>                  level = iterator.level;
> @@ -943,6 +947,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
>                          break;
>          }
>          write_unlock(&vcpu->kvm->mmu_lock);
> +        mutex_unlock(&vcpu->arch.mmu_shadow_page_cache_lock);
>  }
>
>  /* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 7c25dbf32ecc..fa6eb1e9101e 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -265,7 +265,8 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
>          struct kvm_mmu_page *sp;
>
>          sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
> -        sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
> +        sp->spt = mmu_sp_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache,
> +                                            true);
>
>          return sp;
>  }