From: Ben Gardon
Date: Tue, 23 Nov 2021 12:12:15 -0800
Subject: Re: [PATCH 28/28] KVM: x86/mmu: Defer TLB flush to caller when freeing TDP MMU shadow pages
To: Sean Christopherson
Cc: Paolo Bonzini, Vitaly Kuznetsov, Wanpeng Li, Jim Mattson, Joerg Roedel,
    kvm@vger.kernel.org, linux-kernel@vger.kernel.org, Hou Wenlong
In-Reply-To: <20211120045046.3940942-29-seanjc@google.com>
References: <20211120045046.3940942-1-seanjc@google.com> <20211120045046.3940942-29-seanjc@google.com>

On Fri, Nov 19, 2021 at 8:51 PM Sean Christopherson wrote:
>
> Defer TLB flushes to the caller when freeing TDP MMU shadow pages instead
> of immediately flushing.  Because the shadow pages are freed in an RCU
> callback, so long as at least one CPU holds RCU, all CPUs are protected.
> For vCPUs running in the guest, i.e. consuming TLB entries, KVM only
> needs to ensure the caller services the pending TLB flush before dropping
> its RCU protections.  I.e. use the caller's RCU as a proxy for all vCPUs
> running in the guest.
>
> Deferring the flushes allows batching flushes, e.g. when installing a
> 1gb hugepage and zapping a pile of SPs, and when zapping an entire root,
> allows skipping the flush entirely (because flushes are not needed in
> that case).
>
> Signed-off-by: Sean Christopherson

Reviewed-by: Ben Gardon
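
To restate the contract for anyone following along: every flow that can
free TDP MMU shadow pages now ends up with roughly the shape below. This is
just an illustrative sketch, not code from the series; zap_paging_structures()
and the gfn/nr_pages arguments are made-up placeholders.

        /*
         * Sketch only: the zap side frees each removed SP via
         * call_rcu(&sp->rcu_head, ...), so the SPs cannot be reused while
         * the caller remains in its RCU read-side section.  The caller must
         * therefore service the pending TLB flush *before* dropping RCU,
         * otherwise a vCPU could still hold stale translations into a page
         * that is about to be freed and recycled.
         */
        rcu_read_lock();

        flush = zap_paging_structures(kvm);     /* hypothetical zap step */
        if (flush)
                kvm_flush_remote_tlbs_with_address(kvm, gfn, nr_pages);

        rcu_read_unlock();

With that in place, the per-SP flush in handle_removed_tdp_mmu_page() really
is redundant, and the batching described in the changelog falls out naturally.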

> ---
>  arch/x86/kvm/mmu/mmu.c      | 12 ++++++++++++
>  arch/x86/kvm/mmu/tdp_iter.h |  7 +++----
>  arch/x86/kvm/mmu/tdp_mmu.c  | 23 +++++++++++------------
>  3 files changed, 26 insertions(+), 16 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index ef689b8bab12..7aab9737dffa 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -6237,6 +6237,12 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
>         rcu_idx = srcu_read_lock(&kvm->srcu);
>         write_lock(&kvm->mmu_lock);
>
> +       /*
> +        * Zapping TDP MMU shadow pages, including the remote TLB flush, must
> +        * be done under RCU protection, the pages are freed via RCU callback.
> +        */
> +       rcu_read_lock();
> +
>         ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
>         to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
>         for ( ; to_zap; --to_zap) {
> @@ -6261,12 +6267,18 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
>
>                 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
>                         kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
> +                       rcu_read_unlock();
> +
>                         cond_resched_rwlock_write(&kvm->mmu_lock);
>                         flush = false;
> +
> +                       rcu_read_lock();
>                 }
>         }
>         kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
>
> +       rcu_read_unlock();
> +
>         write_unlock(&kvm->mmu_lock);
>         srcu_read_unlock(&kvm->srcu, rcu_idx);
>  }
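
The unlock/relock dance around the resched is the subtle part, so spelling
it out (my words, not the changelog's): cond_resched_rwlock_write() can
sleep, and blocking inside an RCU read-side critical section isn't allowed,
and would stall the grace period that eventually frees the zapped SPs
anyway. So any yield point inside one of these RCU-protected zap loops has
to use the ordering below, which is exactly what the hunk above does; this
just restates it with comments, no new logic:

        if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
                /* Service the pending TLB flush while still under RCU... */
                kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
                /* ...drop RCU before anything that can sleep... */
                rcu_read_unlock();

                cond_resched_rwlock_write(&kvm->mmu_lock);
                flush = false;

                /* ...and retake it before touching more SPTEs/SPs. */
                rcu_read_lock();
        }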

> diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
> index 0693f1fdb81e..0299703fc844 100644
> --- a/arch/x86/kvm/mmu/tdp_iter.h
> +++ b/arch/x86/kvm/mmu/tdp_iter.h
> @@ -9,10 +9,9 @@
>
>  /*
>   * TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs)
> - * to be zapped while holding mmu_lock for read.  Holding RCU isn't required for
> - * correctness if mmu_lock is held for write, but plumbing "struct kvm" down to
> - * the lower depths of the TDP MMU just to make lockdep happy is a nightmare,
> - * so all accesses to SPTEs are done under RCU protection.
> + * to be zapped while holding mmu_lock for read, and to allow TLB flushes to be
> + * batched without having to collect the list of zapped SPs.  Flows that can
> + * remove SPs must service pending TLB flushes prior to dropping RCU protection.
>   */
>  static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
>  {
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 55c16680b927..62cb357b1dff 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -433,9 +433,6 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
>                                     shared);
>         }
>
> -       kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
> -                                          KVM_PAGES_PER_HPAGE(level + 1));
> -
>         call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
>  }
>
> @@ -815,21 +812,14 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
>
>  bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
>  {
> -       u64 old_spte;
> +       u64 old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
>
> -       rcu_read_lock();
> -
> -       old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
> -       if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte))) {
> -               rcu_read_unlock();
> +       if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
>                 return false;
> -       }
>
>         __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
>                            sp->gfn, sp->role.level + 1, true, true);
>
> -       rcu_read_unlock();
> -
>         return true;
>  }
>
> @@ -871,6 +861,11 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
>         }
>
>         rcu_read_unlock();
> +
> +       /*
> +        * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
> +        * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
> +        */
>         return flush;
>  }
>
> @@ -1011,6 +1006,10 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
>                 ret = RET_PF_SPURIOUS;
>         else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
>                 return RET_PF_RETRY;
> +       else if (is_shadow_present_pte(iter->old_spte) &&
> +                !is_last_spte(iter->old_spte, iter->level))
> +               kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
> +                                                  KVM_PAGES_PER_HPAGE(iter->level + 1));
>
>         /*
>          * If the page fault was caused by a write but the page is write
> --
> 2.34.0.rc2.393.gf8c9666880-goog
>
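
One last thought on kvm_tdp_mmu_zap_sp(): with the RCU and flush duties
moved out, a caller that wants to zap a pile of SPs can pay for a single
flush, along the lines of the sketch below. Again this is illustrative
only; the 'to_zap' list (and walking it via the SP's 'link' member) is
hypothetical, and the NX recovery path above actually uses
kvm_mmu_remote_flush_or_zap() rather than a bare kvm_flush_remote_tlbs().

        struct kvm_mmu_page *sp;
        bool flush = false;

        /* mmu_lock is held for write by the caller (not shown). */
        rcu_read_lock();

        /* Hypothetical batch: zap any number of SPs under one RCU section. */
        list_for_each_entry(sp, &to_zap, link)
                flush |= kvm_tdp_mmu_zap_sp(kvm, sp);

        /* A single flush covers every SP zapped above; only then drop RCU. */
        if (flush)
                kvm_flush_remote_tlbs(kvm);

        rcu_read_unlock();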