Date: Fri, 16 Jun 2023 16:09:17 +0800
From: Yuan Yao
To: Yan Zhao
Cc: kvm@vger.kernel.org, linux-kernel@vger.kernel.org, pbonzini@redhat.com,
        seanjc@google.com, chao.gao@intel.com, kai.huang@intel.com,
        robert.hoo.linux@gmail.com
Subject: Re: [PATCH v3 09/11] KVM: x86/mmu: serialize vCPUs to zap gfn when guest MTRRs are honored
Message-ID: <20230616080917.fhekzs2fyhqtbitx@yy-desk-7060>
References: <20230616023101.7019-1-yan.y.zhao@intel.com>
 <20230616023945.7570-1-yan.y.zhao@intel.com>
 <20230616074550.g2ikzbni2rjy7dfw@yy-desk-7060>

On Fri, Jun 16, 2023 at 03:37:29PM +0800, Yan Zhao wrote:
> On Fri, Jun 16, 2023 at 03:45:50PM +0800, Yuan Yao wrote:
> > > +/*
> > > + * Add @range into kvm->arch.mtrr_zap_list and sort the list in
> > > + * "length" ascending + "start" descending order, so that
> > > + * ranges consuming more zap cycles can be dequeued later and their
> > > + * chances of being found duplicated are increased.
> > > + */
> > > +static void kvm_add_mtrr_zap_list(struct kvm *kvm, struct mtrr_zap_range *range)
> > > +{
> > > +        struct list_head *head = &kvm->arch.mtrr_zap_list;
> > > +        u64 len = range->end - range->start;
> > > +        struct mtrr_zap_range *cur, *n;
> > > +        bool added = false;
> > > +
> > > +        spin_lock(&kvm->arch.mtrr_zap_list_lock);
> > > +
> > > +        if (list_empty(head)) {
> > > +                list_add(&range->node, head);
> > > +                spin_unlock(&kvm->arch.mtrr_zap_list_lock);
> > > +                return;
> > > +        }
> > > +
> > > +        list_for_each_entry_safe(cur, n, head, node) {
> > > +                u64 cur_len = cur->end - cur->start;
> > > +
> > > +                if (len < cur_len)
> > > +                        break;
> > > +
> > > +                if (len > cur_len)
> > > +                        continue;
> > > +
> > > +                if (range->start > cur->start)
> > > +                        break;
> > > +
> > > +                if (range->start < cur->start)
> > > +                        continue;
> > > +
> > > +                /* equal len & start, no need to add */
> > > +                added = true;
> >
> > Is it possible/worthwhile to also ignore a range already covered
> > by a queued range?
>
> I may not have understood you correctly, but
> the "added" here means a queued range with exactly the same start + len
> was found, so we free the new range and drop adding it here.

I mean drop adding the three Bs below if A is already in the queue:

|------A--------|
  |----B----|

|------A--------|
|----B----|

|------A--------|
      |----B----|
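Something like this untested sketch on top of your patch (the helper
name is just for illustration):

/*
 * Return true if @range is fully contained in some already-queued
 * range.  Must be called with kvm->arch.mtrr_zap_list_lock held.
 */
static bool kvm_mtrr_zap_list_covers(struct list_head *head,
                                     struct mtrr_zap_range *range)
{
        struct mtrr_zap_range *cur;

        list_for_each_entry(cur, head, node) {
                if (cur->start <= range->start && cur->end >= range->end)
                        return true;
        }
        return false;
}

Then kvm_add_mtrr_zap_list() could free and drop @range when the helper
returns true, generalizing the existing "equal len & start"
de-duplication.  A covering range is at least as long as @range, so the
check doesn't disturb the length-ascending list order.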
> >
> > > +                kfree(range);
> > > +                break;
> > > +        }
> > > +
> > > +        if (!added)
> > > +                list_add_tail(&range->node, &cur->node);
> > > +
> > > +        spin_unlock(&kvm->arch.mtrr_zap_list_lock);
> > > +}
> > > +
> > > +static void kvm_zap_mtrr_zap_list(struct kvm *kvm)
> > > +{
> > > +        struct list_head *head = &kvm->arch.mtrr_zap_list;
> > > +        struct mtrr_zap_range *cur = NULL;
> > > +
> > > +        spin_lock(&kvm->arch.mtrr_zap_list_lock);
> > > +
> > > +        while (!list_empty(head)) {
> > > +                u64 start, end;
> > > +
> > > +                cur = list_first_entry(head, typeof(*cur), node);
> > > +                start = cur->start;
> > > +                end = cur->end;
> > > +                list_del(&cur->node);
> > > +                kfree(cur);
> > > +                spin_unlock(&kvm->arch.mtrr_zap_list_lock);
> > > +
> > > +                kvm_zap_gfn_range(kvm, start, end);
> > > +
> > > +                spin_lock(&kvm->arch.mtrr_zap_list_lock);
> > > +        }
> > > +
> > > +        spin_unlock(&kvm->arch.mtrr_zap_list_lock);
> > > +}
> > > +
> > > +static void kvm_zap_or_wait_mtrr_zap_list(struct kvm *kvm)
> > > +{
> > > +        if (atomic_cmpxchg_acquire(&kvm->arch.mtrr_zapping, 0, 1) == 0) {
> > > +                kvm_zap_mtrr_zap_list(kvm);
> > > +                atomic_set_release(&kvm->arch.mtrr_zapping, 0);
> > > +                return;
> > > +        }
> > > +
> > > +        while (atomic_read(&kvm->arch.mtrr_zapping))
> > > +                cpu_relax();
> > > +}
> > > +
> > > +static void kvm_mtrr_zap_gfn_range(struct kvm_vcpu *vcpu,
> > > +                                   gfn_t gfn_start, gfn_t gfn_end)
> > > +{
> > > +        struct mtrr_zap_range *range;
> > > +
> > > +        range = kmalloc(sizeof(*range), GFP_KERNEL_ACCOUNT);
> > > +        if (!range)
> > > +                goto fail;
> > > +
> > > +        range->start = gfn_start;
> > > +        range->end = gfn_end;
> > > +
> > > +        kvm_add_mtrr_zap_list(vcpu->kvm, range);
> > > +
> > > +        kvm_zap_or_wait_mtrr_zap_list(vcpu->kvm);
> > > +        return;
> > > +
> > > +fail:
> > > +        kvm_clear_mtrr_zap_list(vcpu->kvm);
> >
> > A very small chance of a race condition that incorrectly clears
> > queued ranges which have not been zapped by another thread yet?
> > Like below:
> >
> > Thread A                        | Thread B
> > kvm_add_mtrr_zap_list()         |
> >                                 | kvm_clear_mtrr_zap_list()
> > kvm_zap_or_wait_mtrr_zap_list() |
> >
> > Calling kvm_clear_mtrr_zap_list() here looks unnecessary; other
> > threads (B here) who put things in the queue will take care of them.
> >
> > > +        kvm_zap_gfn_range(vcpu->kvm, gfn_start, gfn_end);
>
> Yes, if gfn_start and gfn_end here are not 0 and ~0ULL, the
> kvm_clear_mtrr_zap_list() is not necessary.
> Though in reality they are always 0 - ~0ULL, I agree dropping the
> kvm_clear_mtrr_zap_list() here is better.
>
> Thanks!
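Yes.  With kvm_clear_mtrr_zap_list() dropped, the fail path could
become something like this (untested sketch, based on this patch):

static void kvm_mtrr_zap_gfn_range(struct kvm_vcpu *vcpu,
                                   gfn_t gfn_start, gfn_t gfn_end)
{
        struct mtrr_zap_range *range;

        range = kmalloc(sizeof(*range), GFP_KERNEL_ACCOUNT);
        if (!range) {
                /*
                 * On allocation failure, zap the range directly
                 * without touching the shared list: clearing the list
                 * here could throw away ranges other vCPUs queued but
                 * have not zapped yet.
                 */
                kvm_zap_gfn_range(vcpu->kvm, gfn_start, gfn_end);
                return;
        }

        range->start = gfn_start;
        range->end = gfn_end;

        kvm_add_mtrr_zap_list(vcpu->kvm, range);
        kvm_zap_or_wait_mtrr_zap_list(vcpu->kvm);
}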