Message-ID: <5819EA34.1020607@intel.com>
Date: Wed, 02 Nov 2016 21:29:24 +0800
From: Jike Song
To: Kirti Wankhede
CC: alex.williamson@redhat.com, pbonzini@redhat.com, kraxel@redhat.com,
    cjia@nvidia.com, qemu-devel@nongnu.org, kvm@vger.kernel.org,
    kevin.tian@intel.com, bjsdjshi@linux.vnet.ibm.com,
    linux-kernel@vger.kernel.org
Subject: Re: [PATCH v10 09/19] vfio iommu type1: Add support for mediated devices
References: <1477517366-27871-1-git-send-email-kwankhede@nvidia.com>
 <1477517366-27871-10-git-send-email-kwankhede@nvidia.com>
In-Reply-To: <1477517366-27871-10-git-send-email-kwankhede@nvidia.com>

On 10/27/2016 05:29 AM, Kirti Wankhede wrote:
> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
> Mediated device only uses IOMMU APIs, the underlying hardware can be
> managed by an IOMMU domain.
>
> Aim of this change is:
> - To use most of the code of TYPE1 IOMMU driver for mediated devices
> - To support direct assigned device and mediated device in single module
>
> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
> backend module. More details:
> - When iommu_group of mediated devices is attached, task structure is
>   cached which is used later to pin pages and page accounting.
> - It keeps track of pinned pages for mediated domain. This data is used to
>   verify unpinning request and to unpin remaining pages while detaching, if
>   there are any.
> - Used existing mechanism for page accounting. If iommu capable domain
>   exists in the container then all pages are already pinned and accounted.
>   Accounting for mdev device is only done if there is no iommu capable
>   domain in the container.
> - Page accounting is updated on hot plug and unplug mdev device and pass
>   through device.
>
> Tested by assigning below combinations of devices to a single VM:
> - GPU pass through only
> - vGPU device only
> - One GPU pass through and one vGPU device
> - Linux VM hot plug and unplug vGPU device while GPU pass through device
>   exist
> - Linux VM hot plug and unplug GPU pass through device while vGPU device
>   exist
>
> Signed-off-by: Kirti Wankhede
> Signed-off-by: Neo Jia
> Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
> ---
>  drivers/vfio/vfio_iommu_type1.c | 646 +++++++++++++++++++++++++++++++++++-----
>  1 file changed, 571 insertions(+), 75 deletions(-)
>
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 861ac2a1b0c3..5add11a147e1 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -36,6 +36,7 @@
>  #include
>  #include
>  #include
> +#include
>
>  #define DRIVER_VERSION "0.2"
>  #define DRIVER_AUTHOR "Alex Williamson "
> @@ -55,18 +56,26 @@ MODULE_PARM_DESC(disable_hugepages,
>
>  struct vfio_iommu {
>  	struct list_head domain_list;
> +	struct vfio_domain *external_domain; /* domain for external user */
>  	struct mutex lock;
>  	struct rb_root dma_list;
>  	bool v2;
>  	bool nesting;
>  };
>
> +struct external_addr_space {
> +	struct task_struct *task;
> +	struct rb_root pfn_list;	/* pinned Host pfn list */
> +	struct mutex pfn_list_lock;	/* mutex for pfn_list */
> +};
> +
>  struct vfio_domain {
> -	struct iommu_domain *domain;
> -	struct list_head next;
> -	struct list_head group_list;
> -	int prot;	/* IOMMU_CACHE */
> -	bool fgsp;	/* Fine-grained super pages */
> +	struct iommu_domain *domain;
> +	struct list_head next;
> +	struct list_head group_list;
> +	struct external_addr_space *external_addr_space;
> +	int prot;	/* IOMMU_CACHE */
> +	bool fgsp;	/* Fine-grained super pages */
>  };
>
>  struct vfio_dma {
> @@ -75,6 +84,7 @@ struct vfio_dma {
>  	unsigned long vaddr;	/* Process virtual addr */
>  	size_t size;		/* Map size (bytes) */
>  	int prot;		/* IOMMU_READ/WRITE */
> +	bool iommu_mapped;
>  };
>
>  struct vfio_group {
> @@ -83,6 +93,21 @@ struct vfio_group {
>  };
>
>  /*
> + * Guest RAM pinning working set or DMA target
> + */
> +struct vfio_pfn {
> +	struct rb_node node;
> +	unsigned long vaddr;	/* virtual addr */
> +	dma_addr_t iova;	/* IOVA */
> +	unsigned long pfn;	/* Host pfn */
> +	int prot;
> +	atomic_t ref_count;
> +};
> +
> +#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
> +			(!list_empty(&iommu->domain_list))
> +
> +/*
>   * This code handles mapping and unmapping of user data buffers
>   * into DMA'ble space using the IOMMU
>   */
> @@ -130,6 +155,101 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
>  	rb_erase(&old->node, &iommu->dma_list);
>  }
>
> +/*
> + * Helper Functions for host pfn list
> + */
> +
> +static struct vfio_pfn *vfio_find_pfn(struct vfio_domain *domain,
> +				      unsigned long pfn)
> +{
> +	struct rb_node *node;
> +	struct vfio_pfn *vpfn;
> +
> +	node = domain->external_addr_space->pfn_list.rb_node;
> +
> +	while (node) {
> +		vpfn = rb_entry(node, struct vfio_pfn, node);
> +
> +		if (pfn < vpfn->pfn)
> +			node = node->rb_left;
> +		else if (pfn > vpfn->pfn)
> +			node = node->rb_right;
> +		else
> +			return vpfn;
> +	}
> +
> +	return NULL;
> +}
> +
> +static void vfio_link_pfn(struct vfio_domain *domain, struct vfio_pfn *new)
> +{
> +	struct rb_node **link, *parent = NULL;
> +	struct vfio_pfn *vpfn;
> +
> +	link = &domain->external_addr_space->pfn_list.rb_node;
> +	while (*link) {
> +		parent = *link;
> +		vpfn = rb_entry(parent, struct vfio_pfn, node);
> +
> +		if (new->pfn < vpfn->pfn)
> +			link = &(*link)->rb_left;
> +		else
> +			link = &(*link)->rb_right;
> +	}
> +
> +	rb_link_node(&new->node, parent, link);
> +	rb_insert_color(&new->node, &domain->external_addr_space->pfn_list);
> +}
> +
> +static void vfio_unlink_pfn(struct vfio_domain *domain, struct vfio_pfn *old)
> +{
> +	rb_erase(&old->node, &domain->external_addr_space->pfn_list);
> +}
> +
> +static int vfio_add_to_pfn_list(struct vfio_domain *domain, unsigned long vaddr,
> +				dma_addr_t iova, unsigned long pfn, int prot)
> +{
> +	struct vfio_pfn *vpfn;
> +
> +	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
> +	if (!vpfn)
> +		return -ENOMEM;
> +
> +	vpfn->vaddr = vaddr;
> +	vpfn->iova = iova;
> +	vpfn->pfn = pfn;
> +	vpfn->prot = prot;
> +	atomic_set(&vpfn->ref_count, 1);
> +	vfio_link_pfn(domain, vpfn);
> +	return 0;
> +}
> +
> +static void vfio_remove_from_pfn_list(struct vfio_domain *domain,
> +				      struct vfio_pfn *vpfn)
> +{
> +	vfio_unlink_pfn(domain, vpfn);
> +	kfree(vpfn);
> +}
> +
> +static int vfio_pfn_account(struct vfio_iommu *iommu, unsigned long pfn)
> +{
> +	struct vfio_pfn *p;
> +	struct vfio_domain *domain = iommu->external_domain;
> +	int ret = 1;
> +
> +	if (!domain)
> +		return 1;
> +
> +	mutex_lock(&domain->external_addr_space->pfn_list_lock);
> +
> +	p = vfio_find_pfn(domain, pfn);
> +	if (p)
> +		ret = 0;
> +
> +	mutex_unlock(&domain->external_addr_space->pfn_list_lock);
> +	return ret;
> +}
> +
>  struct vwork {
>  	struct mm_struct *mm;
>  	long npage;
> @@ -270,12 +390,13 @@ static int vaddr_get_pfn(struct mm_struct *remote_mm, unsigned long vaddr,
>   * the iommu can only map chunks of consecutive pfns anyway, so get the
>   * first page and all consecutive pages with the same locking.
>   */
> -static long vfio_pin_pages(unsigned long vaddr, long npage,
> -			   int prot, unsigned long *pfn_base)
> +static long __vfio_pin_pages_remote(struct vfio_iommu *iommu,
> +				    unsigned long vaddr, long npage,
> +				    int prot, unsigned long *pfn_base)
>  {
>  	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>  	bool lock_cap = capable(CAP_IPC_LOCK);
> -	long ret, i;
> +	long ret, i, lock_acct = 0;
>  	bool rsvd;
>
>  	if (!current->mm)
> @@ -285,9 +406,11 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  	if (ret)
>  		return ret;
>
> +	lock_acct = vfio_pfn_account(iommu, *pfn_base);
> +
>  	rsvd = is_invalid_reserved_pfn(*pfn_base);
>
> -	if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) {
> +	if (!rsvd && !lock_cap && current->mm->locked_vm + lock_acct > limit) {
>  		put_pfn(*pfn_base, prot);
>  		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
>  			limit << PAGE_SHIFT);
> @@ -314,8 +437,10 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  			break;
>  		}
>
> +		lock_acct += vfio_pfn_account(iommu, pfn);
> +
>  		if (!rsvd && !lock_cap &&
> -		    current->mm->locked_vm + i + 1 > limit) {
> +		    current->mm->locked_vm + lock_acct > limit) {
>  			put_pfn(pfn, prot);
>  			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
>  				__func__, limit << PAGE_SHIFT);
> @@ -329,14 +454,19 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
>  	return i;
>  }
>
> -static long vfio_unpin_pages(unsigned long pfn, long npage,
> -			     int prot, bool do_accounting)
> +static long __vfio_unpin_pages_remote(struct vfio_iommu *iommu,
> +				      unsigned long pfn, long npage, int prot,
> +				      bool do_accounting)
>  {
> -	unsigned long unlocked = 0;
> +	unsigned long unlocked = 0, unlock_acct = 0;
>  	long i;
>
> -	for (i = 0; i < npage; i++)
> +	for (i = 0; i < npage; i++) {
> +		if (do_accounting)
> +			unlock_acct += vfio_pfn_account(iommu, pfn);
> +
>  		unlocked += put_pfn(pfn++, prot);
> +	}
>
>  	if (do_accounting)
>  		vfio_lock_acct(current, -unlocked);
> @@ -344,14 +474,208 @@ static long vfio_unpin_pages(unsigned long pfn, long npage,
>  	return unlocked;
>  }
>
> -static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
> +static long __vfio_pin_page_external(struct vfio_domain *domain,
> +				     unsigned long vaddr, int prot,
> +				     unsigned long *pfn_base,
> +				     bool do_accounting)
> +{
> +	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +	bool lock_cap = capable(CAP_IPC_LOCK);
> +	long ret;
> +	bool rsvd;
> +	struct task_struct *task = domain->external_addr_space->task;
> +
> +	if (!task->mm)
> +		return -ENODEV;
> +
> +	ret = vaddr_get_pfn(task->mm, vaddr, prot, pfn_base);

Hi Kirti,

In your implementation of vaddr_get_pfn(), as long as the first argument
(the remote mm) is not NULL, you call get_user_pages_remote() rather than
get_user_pages_fast(). But @task here may actually be @current, so that
task->mm == current->mm. That case is not rare; it is probably the common
one: the QEMU process emulates an MMIO read/write and calls
vfio_pin_pages() from its own process context. If I read the code
correctly, the current implementation always forces the locked
(mmap_sem-protected) GUP path, no matter which process context it runs in.

--
Thanks,
Jike
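P.S. Just to illustrate the point in code: a rough, hypothetical sketch of
dispatching between the two GUP flavours, not code from the patch. The
helper name vaddr_get_pfn_sketch() is made up, and the GUP call signatures
are the v4.9-era ones, so they may need tweaking on other trees.

/*
 * Hypothetical sketch, not part of the patch: pick the GUP flavour based
 * on whether the cached task's mm is the caller's, so the common case
 * (QEMU pinning from its own context) keeps the lockless fast path.
 * IOMMU_WRITE and FOLL_WRITE come from <linux/iommu.h> and <linux/mm.h>,
 * which vfio_iommu_type1.c already includes.
 */
static int vaddr_get_pfn_sketch(struct task_struct *task, unsigned long vaddr,
				int prot, unsigned long *pfn)
{
	struct mm_struct *mm = task ? task->mm : current->mm;
	struct page *page[1];
	int ret;

	if (!mm)
		return -ENODEV;

	if (mm == current->mm) {
		/* Caller's own mm: lockless fast path, as the old code did. */
		ret = get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE),
					  page);
	} else {
		/* Truly remote mm: fall back to the locked, remote GUP. */
		down_read(&mm->mmap_sem);
		ret = get_user_pages_remote(task, mm, vaddr, 1,
					    (prot & IOMMU_WRITE) ? FOLL_WRITE : 0,
					    page, NULL);
		up_read(&mm->mmap_sem);
	}

	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	*pfn = page_to_pfn(page[0]);
	return 0;
}

The remote path still has to take mmap_sem and is inherently slower, which
is the point above: it would only be taken when task->mm really differs
from current->mm.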