Am 22.01.24 um 04:32 schrieb Xianrong Zhou:
> The vmf_insert_pfn_prot could cause an unnecessary double fault
> on a device pfn, because currently vmf_insert_pfn_prot does not
> make the pfn writable, so the pte entry is normally read-only or
> dirty catching.
What? How did you get to this conclusion?
> The first fault only sets up the pte entry, which actually is
> dirty catching. The second fault then hits the same pfn immediately,
> because of that dirty catching, when the cpu re-executes the store
> instruction.
It could be that this is done to work around some hw behavior, but not
because of dirty catching.
> Normally, if a driver calls vmf_insert_pfn_prot and also supplies a
> 'pfn_mkwrite' callback in its vm_operations_struct, which requires
> the pte to be dirty catching, then the vmf_insert_pfn_prot behaviour
> and the double fault are reasonable. That is not a problem.
Well, as far as I can see that behavior absolutely doesn't make sense.
When pfn_mkwrite is requested then the driver should use PAGE_COPY,
which is exactly what VMWGFX (the only driver using dirty tracking) is
doing.
Everybody else uses PAGE_SHARED which should make the pte writeable
immediately.
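
Roughly the pattern I mean, as a sketch only (the foo_* names and
helpers are made up for illustration, this is not the exact vmwgfx
code):

static vm_fault_t foo_fault(struct vm_fault *vmf)
{
	struct foo_object *obj = vmf->vma->vm_private_data;	/* hypothetical */
	pgprot_t prot = vm_get_page_prot(vmf->vma->vm_flags & ~VM_SHARED);
	unsigned long pfn = foo_object_pfn(obj, vmf->pgoff);	/* hypothetical */

	/* dirty tracking: insert write-protected, PAGE_COPY style */
	return vmf_insert_pfn_prot(vmf->vma, vmf->address, pfn, prot);
}

static vm_fault_t foo_pfn_mkwrite(struct vm_fault *vmf)
{
	/* note the page as dirty before the first write goes through */
	foo_mark_dirty(vmf->vma->vm_private_data, vmf->pgoff);	/* hypothetical */
	return 0;	/* core code then makes the pte writable */
}

static const struct vm_operations_struct foo_vm_ops = {
	.fault		= foo_fault,
	.pfn_mkwrite	= foo_pfn_mkwrite,
};

Only in that dirty-tracking case is the extra write fault intentional.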
Regards,
Christian.
>
> However, most of the drivers calling vmf_insert_pfn_prot do not
> supply a 'pfn_mkwrite' callback, so for them the second fault is
> unnecessary.
>
> So, just like the vmf_insert_mixed and vmf_insert_mixed_mkwrite pair,
> we should also supply vmf_insert_pfn_mkwrite for drivers.
>
> Signed-off-by: Xianrong Zhou <[email protected]>
> ---
> arch/x86/entry/vdso/vma.c | 3 ++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 2 +-
> drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 2 +-
> drivers/gpu/drm/nouveau/nouveau_gem.c | 2 +-
> drivers/gpu/drm/radeon/radeon_gem.c | 2 +-
> drivers/gpu/drm/ttm/ttm_bo_vm.c | 8 +++++---
> drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c | 8 +++++---
> include/drm/ttm/ttm_bo.h | 3 ++-
> include/linux/mm.h | 2 +-
> mm/memory.c | 14 +++++++++++---
> 10 files changed, 30 insertions(+), 16 deletions(-)
>
> diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
> index 7645730dc228..dd2431c2975f 100644
> --- a/arch/x86/entry/vdso/vma.c
> +++ b/arch/x86/entry/vdso/vma.c
> @@ -185,7 +185,8 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
> if (pvti && vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)) {
> return vmf_insert_pfn_prot(vma, vmf->address,
> __pa(pvti) >> PAGE_SHIFT,
> - pgprot_decrypted(vma->vm_page_prot));
> + pgprot_decrypted(vma->vm_page_prot),
> + true);
> }
> } else if (sym_offset == image->sym_hvclock_page) {
> pfn = hv_get_tsc_pfn();
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index 49a5f1c73b3e..adcb20d9e624 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -64,7 +64,7 @@ static vm_fault_t amdgpu_gem_fault(struct vm_fault *vmf)
> }
>
> ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
> - TTM_BO_VM_NUM_PREFAULT);
> + TTM_BO_VM_NUM_PREFAULT, true);
>
> drm_dev_exit(idx);
> } else {
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> index 9227f8146a58..c6f13ae6c308 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> @@ -1114,7 +1114,7 @@ static vm_fault_t vm_fault_ttm(struct vm_fault *vmf)
>
> if (drm_dev_enter(dev, &idx)) {
> ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
> - TTM_BO_VM_NUM_PREFAULT);
> + TTM_BO_VM_NUM_PREFAULT, true);
> drm_dev_exit(idx);
> } else {
> ret = ttm_bo_vm_dummy_page(vmf, vmf->vma->vm_page_prot);
> diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
> index 49c2bcbef129..7e1453762ec9 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_gem.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
> @@ -56,7 +56,7 @@ static vm_fault_t nouveau_ttm_fault(struct vm_fault *vmf)
>
> nouveau_bo_del_io_reserve_lru(bo);
> prot = vm_get_page_prot(vma->vm_flags);
> - ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT);
> + ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT, true);
> nouveau_bo_add_io_reserve_lru(bo);
> if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
> return ret;
> diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c
> index 3fec3acdaf28..b21cf00ae162 100644
> --- a/drivers/gpu/drm/radeon/radeon_gem.c
> +++ b/drivers/gpu/drm/radeon/radeon_gem.c
> @@ -62,7 +62,7 @@ static vm_fault_t radeon_gem_fault(struct vm_fault *vmf)
> goto unlock_resv;
>
> ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
> - TTM_BO_VM_NUM_PREFAULT);
> + TTM_BO_VM_NUM_PREFAULT, true);
> if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
> goto unlock_mclk;
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> index 4212b8c91dd4..7d14a7d267aa 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> @@ -167,6 +167,7 @@ EXPORT_SYMBOL(ttm_bo_vm_reserve);
> * @num_prefault: Maximum number of prefault pages. The caller may want to
> * specify this based on madvice settings and the size of the GPU object
> * backed by the memory.
> + * @mkwrite: make the pfn or page writable
> *
> * This function inserts one or more page table entries pointing to the
> * memory backing the buffer object, and then returns a return code
> @@ -180,7 +181,8 @@ EXPORT_SYMBOL(ttm_bo_vm_reserve);
> */
> vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
> pgprot_t prot,
> - pgoff_t num_prefault)
> + pgoff_t num_prefault,
> + bool mkwrite)
> {
> struct vm_area_struct *vma = vmf->vma;
> struct ttm_buffer_object *bo = vma->vm_private_data;
> @@ -263,7 +265,7 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
> * at arbitrary times while the data is mmap'ed.
> * See vmf_insert_pfn_prot() for a discussion.
> */
> - ret = vmf_insert_pfn_prot(vma, address, pfn, prot);
> + ret = vmf_insert_pfn_prot(vma, address, pfn, prot, mkwrite);
>
> /* Never error on prefaulted PTEs */
> if (unlikely((ret & VM_FAULT_ERROR))) {
> @@ -312,7 +314,7 @@ vm_fault_t ttm_bo_vm_dummy_page(struct vm_fault *vmf, pgprot_t prot)
> /* Prefault the entire VMA range right away to avoid further faults */
> for (address = vma->vm_start; address < vma->vm_end;
> address += PAGE_SIZE)
> - ret = vmf_insert_pfn_prot(vma, address, pfn, prot);
> + ret = vmf_insert_pfn_prot(vma, address, pfn, prot, true);
>
> return ret;
> }
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> index 74ff2812d66a..bb8e4b641681 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> @@ -452,12 +452,14 @@ vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf)
> * sure the page protection is write-enabled so we don't get
> * a lot of unnecessary write faults.
> */
> - if (vbo->dirty && vbo->dirty->method == VMW_BO_DIRTY_MKWRITE)
> + if (vbo->dirty && vbo->dirty->method == VMW_BO_DIRTY_MKWRITE) {
> prot = vm_get_page_prot(vma->vm_flags & ~VM_SHARED);
> - else
> + ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault, false);
> + } else {
> prot = vm_get_page_prot(vma->vm_flags);
> + ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault, true);
> + }
>
> - ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault);
> if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
> return ret;
>
> diff --git a/include/drm/ttm/ttm_bo.h b/include/drm/ttm/ttm_bo.h
> index 0223a41a64b2..66e293db69ee 100644
> --- a/include/drm/ttm/ttm_bo.h
> +++ b/include/drm/ttm/ttm_bo.h
> @@ -386,7 +386,8 @@ vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
> struct vm_fault *vmf);
> vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
> pgprot_t prot,
> - pgoff_t num_prefault);
> + pgoff_t num_prefault,
> + bool mkwrite);
> vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf);
> void ttm_bo_vm_open(struct vm_area_struct *vma);
> void ttm_bo_vm_close(struct vm_area_struct *vma);
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index f5a97dec5169..f8868e28ea04 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3553,7 +3553,7 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
> vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
> unsigned long pfn);
> vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
> - unsigned long pfn, pgprot_t pgprot);
> + unsigned long pfn, pgprot_t pgprot, bool mkwrite);
> vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
> pfn_t pfn);
> vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
> diff --git a/mm/memory.c b/mm/memory.c
> index 7e1f4849463a..2c28f1a349ff 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2195,6 +2195,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
> * @addr: target user address of this page
> * @pfn: source kernel pfn
> * @pgprot: pgprot flags for the inserted page
> + * @mkwrite: make the pfn writable
> *
> * This is exactly like vmf_insert_pfn(), except that it allows drivers
> * to override pgprot on a per-page basis.
> @@ -2223,7 +2224,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
> * Return: vm_fault_t value.
> */
> vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
> - unsigned long pfn, pgprot_t pgprot)
> + unsigned long pfn, pgprot_t pgprot, bool mkwrite)
> {
> /*
> * Technically, architectures with pte_special can avoid all these
> @@ -2246,7 +2247,7 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
> track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
>
> return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
> - false);
> + mkwrite);
> }
> EXPORT_SYMBOL(vmf_insert_pfn_prot);
>
> @@ -2273,10 +2274,17 @@ EXPORT_SYMBOL(vmf_insert_pfn_prot);
> vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
> unsigned long pfn)
> {
> - return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
> + return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot, false);
> }
> EXPORT_SYMBOL(vmf_insert_pfn);
>
> +vm_fault_t vmf_insert_pfn_mkwrite(struct vm_area_struct *vma, unsigned long addr,
> + unsigned long pfn)
> +{
> + return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot, true);
> +}
> +EXPORT_SYMBOL(vmf_insert_pfn_mkwrite);
> +
> static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
> {
> /* these checks mirror the abort conditions in vm_normal_page */
[AMD Official Use Only - General]
> > The vmf_insert_pfn_prot could cause unnecessary double faults on a
> > device pfn. Because currently the vmf_insert_pfn_prot does not make
> > the pfn writable so the pte entry is normally read-only or dirty
> > catching.
>
> What? How do you got to this conclusion?
Sorry, I did not mention that this problem only exists on the arm64 platform.
On arm64, PTE_RDONLY is automatically attached to userspace pte
entries even with VM_WRITE + VM_SHARED. PTE_RDONLY needs to be
cleared in vmf_insert_pfn_prot. However, vmf_insert_pfn_prot does not
make the pte writable, because it passes false for @mkwrite to
insert_pfn.
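
To show what the second fault actually does, here is a paraphrased
sketch (from memory, not verbatim) of the core write-fault handling for
a present pte: pte_write() is already true, so all the extra fault does
is mark the pte dirty, which on arm64 is what clears PTE_RDONLY:

	/* paraphrased from handle_pte_fault(); not the exact source */
	if (vmf->flags & FAULT_FLAG_WRITE) {
		if (!pte_write(entry))
			return do_wp_page(vmf);
		/* pte_write() is true here, only the dirty bit is missing */
		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
			      vmf->flags & FAULT_FLAG_WRITE);

That whole round trip could be avoided if the pte were inserted dirty
and writable in the first place.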
>
> > The first fault only sets up the pte entry which actually is dirty
> > catching. And the second immediate fault to the pfn due to first dirty
> > catching when the cpu re-execute the store instruction.
>
> It could be that this is done to work around some hw behavior, but not
> because of dirty catching.
>
> > Normally if the drivers call vmf_insert_pfn_prot and also supply
> > 'pfn_mkwrite' callback within vm_operations_struct which requires the
> > pte to be dirty catching then the vmf_insert_pfn_prot and the double
> > fault are reasonable. It is not a problem.
>
> Well, as far as I can see that behavior absolutely doesn't make sense.
>
> When pfn_mkwrite is requested then the driver should use PAGE_COPY, which
> is exactly what VMWGFX (the only driver using dirty tracking) is doing.
>
> Everybody else uses PAGE_SHARED which should make the pte writeable
> immediately.
>
> Regards,
> Christian.
>
> >
> > However the most of drivers calling vmf_insert_pfn_prot do not supply
> > the 'pfn_mkwrite' callback so that the second fault is unnecessary.
> >
> > So just like vmf_insert_mixed and vmf_insert_mixed_mkwrite pair, we
> > should also supply vmf_insert_pfn_mkwrite for drivers as well.
> >
> > Signed-off-by: Xianrong Zhou <[email protected]>
> > ---
> > arch/x86/entry/vdso/vma.c | 3 ++-
> > drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 2 +-
> > drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 2 +-
> > drivers/gpu/drm/nouveau/nouveau_gem.c | 2 +-
> > drivers/gpu/drm/radeon/radeon_gem.c | 2 +-
> > drivers/gpu/drm/ttm/ttm_bo_vm.c | 8 +++++---
> > drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c | 8 +++++---
> > include/drm/ttm/ttm_bo.h | 3 ++-
> > include/linux/mm.h | 2 +-
> > mm/memory.c | 14 +++++++++++---
> > 10 files changed, 30 insertions(+), 16 deletions(-)
> >
> > diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
> > index 7645730dc228..dd2431c2975f 100644
> > --- a/arch/x86/entry/vdso/vma.c
> > +++ b/arch/x86/entry/vdso/vma.c
> > @@ -185,7 +185,8 @@ static vm_fault_t vvar_fault(const struct
> vm_special_mapping *sm,
> > if (pvti && vclock_was_used(VDSO_CLOCKMODE_PVCLOCK))
> {
> > return vmf_insert_pfn_prot(vma, vmf->address,
> > __pa(pvti) >> PAGE_SHIFT,
> > - pgprot_decrypted(vma-
> >vm_page_prot));
> > + pgprot_decrypted(vma-
> >vm_page_prot),
> > + true);
> > }
> > } else if (sym_offset == image->sym_hvclock_page) {
> > pfn = hv_get_tsc_pfn();
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> > index 49a5f1c73b3e..adcb20d9e624 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> > @@ -64,7 +64,7 @@ static vm_fault_t amdgpu_gem_fault(struct vm_fault
> *vmf)
> > }
> >
> > ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma-
> >vm_page_prot,
> > - TTM_BO_VM_NUM_PREFAULT);
> > + TTM_BO_VM_NUM_PREFAULT,
> true);
> >
> > drm_dev_exit(idx);
> > } else {
> > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> > b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> > index 9227f8146a58..c6f13ae6c308 100644
> > --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> > +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> > @@ -1114,7 +1114,7 @@ static vm_fault_t vm_fault_ttm(struct vm_fault
> > *vmf)
> >
> > if (drm_dev_enter(dev, &idx)) {
> > ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma-
> >vm_page_prot,
> > - TTM_BO_VM_NUM_PREFAULT);
> > + TTM_BO_VM_NUM_PREFAULT,
> true);
> > drm_dev_exit(idx);
> > } else {
> > ret = ttm_bo_vm_dummy_page(vmf, vmf->vma-
> >vm_page_prot); diff
> > --git a/drivers/gpu/drm/nouveau/nouveau_gem.c
> > b/drivers/gpu/drm/nouveau/nouveau_gem.c
> > index 49c2bcbef129..7e1453762ec9 100644
> > --- a/drivers/gpu/drm/nouveau/nouveau_gem.c
> > +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
> > @@ -56,7 +56,7 @@ static vm_fault_t nouveau_ttm_fault(struct vm_fault
> > *vmf)
> >
> > nouveau_bo_del_io_reserve_lru(bo);
> > prot = vm_get_page_prot(vma->vm_flags);
> > - ret = ttm_bo_vm_fault_reserved(vmf, prot,
> TTM_BO_VM_NUM_PREFAULT);
> > + ret = ttm_bo_vm_fault_reserved(vmf, prot,
> TTM_BO_VM_NUM_PREFAULT,
> > +true);
> > nouveau_bo_add_io_reserve_lru(bo);
> > if (ret == VM_FAULT_RETRY && !(vmf->flags &
> FAULT_FLAG_RETRY_NOWAIT))
> > return ret;
> > diff --git a/drivers/gpu/drm/radeon/radeon_gem.c
> > b/drivers/gpu/drm/radeon/radeon_gem.c
> > index 3fec3acdaf28..b21cf00ae162 100644
> > --- a/drivers/gpu/drm/radeon/radeon_gem.c
> > +++ b/drivers/gpu/drm/radeon/radeon_gem.c
> > @@ -62,7 +62,7 @@ static vm_fault_t radeon_gem_fault(struct vm_fault
> *vmf)
> > goto unlock_resv;
> >
> > ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
> > - TTM_BO_VM_NUM_PREFAULT);
> > + TTM_BO_VM_NUM_PREFAULT, true);
> > if (ret == VM_FAULT_RETRY && !(vmf->flags &
> FAULT_FLAG_RETRY_NOWAIT))
> > goto unlock_mclk;
> >
> > diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c
> > b/drivers/gpu/drm/ttm/ttm_bo_vm.c index
> 4212b8c91dd4..7d14a7d267aa
> > 100644
> > --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
> > +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> > @@ -167,6 +167,7 @@ EXPORT_SYMBOL(ttm_bo_vm_reserve);
> > * @num_prefault: Maximum number of prefault pages. The caller may
> want to
> > * specify this based on madvice settings and the size of the GPU object
> > * backed by the memory.
> > + * @mkwrite: make the pfn or page writable
> > *
> > * This function inserts one or more page table entries pointing to the
> > * memory backing the buffer object, and then returns a return code
> > @@ -180,7 +181,8 @@ EXPORT_SYMBOL(ttm_bo_vm_reserve);
> > */
> > vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
> > pgprot_t prot,
> > - pgoff_t num_prefault)
> > + pgoff_t num_prefault,
> > + bool mkwrite)
> > {
> > struct vm_area_struct *vma = vmf->vma;
> > struct ttm_buffer_object *bo = vma->vm_private_data; @@ -263,7
> > +265,7 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
> > * at arbitrary times while the data is mmap'ed.
> > * See vmf_insert_pfn_prot() for a discussion.
> > */
> > - ret = vmf_insert_pfn_prot(vma, address, pfn, prot);
> > + ret = vmf_insert_pfn_prot(vma, address, pfn, prot, mkwrite);
> >
> > /* Never error on prefaulted PTEs */
> > if (unlikely((ret & VM_FAULT_ERROR))) { @@ -312,7 +314,7
> @@
> > vm_fault_t ttm_bo_vm_dummy_page(struct vm_fault *vmf, pgprot_t prot)
> > /* Prefault the entire VMA range right away to avoid further faults */
> > for (address = vma->vm_start; address < vma->vm_end;
> > address += PAGE_SIZE)
> > - ret = vmf_insert_pfn_prot(vma, address, pfn, prot);
> > + ret = vmf_insert_pfn_prot(vma, address, pfn, prot, true);
> >
> > return ret;
> > }
> > diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> > b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> > index 74ff2812d66a..bb8e4b641681 100644
> > --- a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> > +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> > @@ -452,12 +452,14 @@ vm_fault_t vmw_bo_vm_fault(struct vm_fault
> *vmf)
> > * sure the page protection is write-enabled so we don't get
> > * a lot of unnecessary write faults.
> > */
> > - if (vbo->dirty && vbo->dirty->method == VMW_BO_DIRTY_MKWRITE)
> > + if (vbo->dirty && vbo->dirty->method == VMW_BO_DIRTY_MKWRITE)
> {
> > prot = vm_get_page_prot(vma->vm_flags & ~VM_SHARED);
> > - else
> > + ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault,
> false);
> > + } else {
> > prot = vm_get_page_prot(vma->vm_flags);
> > + ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault,
> true);
> > + }
> >
> > - ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault);
> > if (ret == VM_FAULT_RETRY && !(vmf->flags &
> FAULT_FLAG_RETRY_NOWAIT))
> > return ret;
> >
> > diff --git a/include/drm/ttm/ttm_bo.h b/include/drm/ttm/ttm_bo.h index
> > 0223a41a64b2..66e293db69ee 100644
> > --- a/include/drm/ttm/ttm_bo.h
> > +++ b/include/drm/ttm/ttm_bo.h
> > @@ -386,7 +386,8 @@ vm_fault_t ttm_bo_vm_reserve(struct
> ttm_buffer_object *bo,
> > struct vm_fault *vmf);
> > vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
> > pgprot_t prot,
> > - pgoff_t num_prefault);
> > + pgoff_t num_prefault,
> > + bool mkwrite);
> > vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf);
> > void ttm_bo_vm_open(struct vm_area_struct *vma);
> > void ttm_bo_vm_close(struct vm_area_struct *vma); diff --git
> > a/include/linux/mm.h b/include/linux/mm.h index
> > f5a97dec5169..f8868e28ea04 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -3553,7 +3553,7 @@ int vm_map_pages_zero(struct vm_area_struct
> *vma, struct page **pages,
> > vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long
> addr,
> > unsigned long pfn);
> > vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned
> long addr,
> > - unsigned long pfn, pgprot_t pgprot);
> > + unsigned long pfn, pgprot_t pgprot, bool mkwrite);
> > vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long
> addr,
> > pfn_t pfn);
> > vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, diff
> > --git a/mm/memory.c b/mm/memory.c index 7e1f4849463a..2c28f1a349ff
> > 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -2195,6 +2195,7 @@ static vm_fault_t insert_pfn(struct
> vm_area_struct *vma, unsigned long addr,
> > * @addr: target user address of this page
> > * @pfn: source kernel pfn
> > * @pgprot: pgprot flags for the inserted page
> > + * @mkwrite: make the pfn writable
> > *
> > * This is exactly like vmf_insert_pfn(), except that it allows drivers
> > * to override pgprot on a per-page basis.
> > @@ -2223,7 +2224,7 @@ static vm_fault_t insert_pfn(struct
> vm_area_struct *vma, unsigned long addr,
> > * Return: vm_fault_t value.
> > */
> > vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned
> long addr,
> > - unsigned long pfn, pgprot_t pgprot)
> > + unsigned long pfn, pgprot_t pgprot, bool mkwrite)
> > {
> > /*
> > * Technically, architectures with pte_special can avoid all these
> > @@ -2246,7 +2247,7 @@ vm_fault_t vmf_insert_pfn_prot(struct
> vm_area_struct *vma, unsigned long addr,
> > track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
> >
> > return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
> > - false);
> > + mkwrite);
> > }
> > EXPORT_SYMBOL(vmf_insert_pfn_prot);
> >
> > @@ -2273,10 +2274,17 @@ EXPORT_SYMBOL(vmf_insert_pfn_prot);
> > vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long
> addr,
> > unsigned long pfn)
> > {
> > - return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
> > + return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot,
> > +false);
> > }
> > EXPORT_SYMBOL(vmf_insert_pfn);
> >
> > +vm_fault_t vmf_insert_pfn_mkwrite(struct vm_area_struct *vma, unsigned
> long addr,
> > + unsigned long pfn)
> > +{
> > + return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot,
> true);
> > +} EXPORT_SYMBOL(vmf_insert_pfn_mkwrite);
> > +
> > static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
> > {
> > /* these checks mirror the abort conditions in vm_normal_page */
Am 23.01.24 um 09:33 schrieb Zhou, Xianrong:
> [AMD Official Use Only - General]
>
>>> The vmf_insert_pfn_prot could cause unnecessary double faults on a
>>> device pfn. Because currently the vmf_insert_pfn_prot does not make
>>> the pfn writable so the pte entry is normally read-only or dirty
>>> catching.
>> What? How do you got to this conclusion?
> Sorry. I did not mention that this problem only exists on arm64 platform.
Ok, that makes at least a little bit more sense.
> On arm64, PTE_RDONLY is automatically attached to userspace pte
> entries even with VM_WRITE + VM_SHARED. PTE_RDONLY needs to be
> cleared in vmf_insert_pfn_prot. However, vmf_insert_pfn_prot does not
> make the pte writable, because it passes false for @mkwrite to
> insert_pfn.
Question is why is arm64 doing this? As far as I can see they must have
some hardware reason for that.
The mkwrite parameter to insert_pfn() was added by commit b2770da642540
to make insert_pfn() look more like insert_pfn_pmd() so that the DAX
code can insert PTEs which are writable and dirty at the same time.
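
For reference, the mkwrite handling in insert_pfn() looks roughly like
this (paraphrased from memory, not verbatim):

	entry = pte_mkspecial(pfn_t_pte(pfn, prot));
	if (mkwrite) {
		entry = pte_mkyoung(entry);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	}
	set_pte_at(mm, addr, pte, entry);

In other words, with mkwrite the pte is inserted already young, dirty
and writable, which is what the DAX use case needed.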
This is a completely different use case to what you try to use it here
for and that looks extremely fishy to me.
Regards,
Christian.
>
>>> The first fault only sets up the pte entry which actually is dirty
>>> catching. And the second immediate fault to the pfn due to first dirty
>>> catching when the cpu re-execute the store instruction.
>> It could be that this is done to work around some hw behavior, but not
>> because of dirty catching.
>>
>>> Normally if the drivers call vmf_insert_pfn_prot and also supply
>>> 'pfn_mkwrite' callback within vm_operations_struct which requires the
>>> pte to be dirty catching then the vmf_insert_pfn_prot and the double
>>> fault are reasonable. It is not a problem.
>> Well, as far as I can see that behavior absolutely doesn't make sense.
>>
>> When pfn_mkwrite is requested then the driver should use PAGE_COPY, which
>> is exactly what VMWGFX (the only driver using dirty tracking) is doing.
>>
>> Everybody else uses PAGE_SHARED which should make the pte writeable
>> immediately.
>>
>> Regards,
>> Christian.
[AMD Official Use Only - General]
> >>> The vmf_insert_pfn_prot could cause unnecessary double faults on a
> >>> device pfn. Because currently the vmf_insert_pfn_prot does not make
> >>> the pfn writable so the pte entry is normally read-only or dirty
> >>> catching.
> >> What? How do you got to this conclusion?
> > Sorry. I did not mention that this problem only exists on arm64 platform.
>
> Ok, that makes at least a little bit more sense.
>
> > On arm64, PTE_RDONLY is automatically attached to userspace pte
> > entries even with VM_WRITE + VM_SHARED. PTE_RDONLY needs to be
> > cleared in vmf_insert_pfn_prot. However, vmf_insert_pfn_prot does
> > not make the pte writable, because it passes false for @mkwrite to
> > insert_pfn.
>
> Question is why is arm64 doing this? As far as I can see they must have some
> hardware reason for that.
>
> The mkwrite parameter to insert_pfn() was added by commit
> b2770da642540 to make insert_pfn() look more like insert_pfn_pmd() so that
> the DAX code can insert PTEs which are writable and dirty at the same time.
>
That is one scenario for doing so. In fact, on arm64 there are many
scenarios where it makes sense. So we can let vmf_insert_pfn_prot
support @mkwrite for drivers at the core layer and let each driver
decide whether or not to make the pte writable and dirty at the same
time. That is what this patch does. Otherwise we get double faults on
arm64 whenever vmf_insert_pfn_prot is called.
> This is a completely different use case to what you try to use it here for and
> that looks extremely fishy to me.
>
> Regards,
> Christian.
>
> >
> >>> The first fault only sets up the pte entry which actually is dirty
> >>> catching. And the second immediate fault to the pfn due to first
> >>> dirty catching when the cpu re-execute the store instruction.
> >> It could be that this is done to work around some hw behavior, but
> >> not because of dirty catching.
> >>
> >>> Normally if the drivers call vmf_insert_pfn_prot and also supply
> >>> 'pfn_mkwrite' callback within vm_operations_struct which requires
> >>> the pte to be dirty catching then the vmf_insert_pfn_prot and the
> >>> double fault are reasonable. It is not a problem.
> >> Well, as far as I can see that behavior absolutely doesn't make sense.
> >>
> >> When pfn_mkwrite is requested then the driver should use PAGE_COPY,
> >> which is exactly what VMWGFX (the only driver using dirty tracking) is
> doing.
> >>
> >> Everybody else uses PAGE_SHARED which should make the pte writeable
> >> immediately.
> >>
> >> Regards,
> >> Christian.
Am 24.01.24 um 03:43 schrieb Zhou, Xianrong:
> [AMD Official Use Only - General]
>
>>>>> The vmf_insert_pfn_prot could cause unnecessary double faults on a
>>>>> device pfn. Because currently the vmf_insert_pfn_prot does not make
>>>>> the pfn writable so the pte entry is normally read-only or dirty
>>>>> catching.
>>>> What? How do you got to this conclusion?
>>> Sorry. I did not mention that this problem only exists on arm64 platform.
>> Ok, that makes at least a little bit more sense.
>>
>>> On arm64, PTE_RDONLY is automatically attached to userspace pte
>>> entries even with VM_WRITE + VM_SHARED. PTE_RDONLY needs to be
>>> cleared in vmf_insert_pfn_prot. However, vmf_insert_pfn_prot does
>>> not make the pte writable, because it passes false for @mkwrite to
>>> insert_pfn.
>> Question is why is arm64 doing this? As far as I can see they must have some
>> hardware reason for that.
>>
>> The mkwrite parameter to insert_pfn() was added by commit
>> b2770da642540 to make insert_pfn() look more like insert_pfn_pmd() so that
>> the DAX code can insert PTEs which are writable and dirty at the same time.
>>
> That is one scenario for doing so. In fact, on arm64 there are many
> scenarios where it makes sense. So we can let vmf_insert_pfn_prot
> support @mkwrite for drivers at the core layer and let each driver
> decide whether or not to make the pte writable and dirty at the same
> time. That is what this patch does. Otherwise we get double faults on
> arm64 whenever vmf_insert_pfn_prot is called.
Well, that doesn't answer my question why arm64 is double faulting in
the first place.
So as long as this isn't sorted out I'm going to reject this patch.
Regards,
Christian.
>
>> This is a completely different use case to what you try to use it here for and
>> that looks extremely fishy to me.
>>
>> Regards,
>> Christian.
>>
>>>>> The first fault only sets up the pte entry which actually is dirty
>>>>> catching. And the second immediate fault to the pfn due to first
>>>>> dirty catching when the cpu re-execute the store instruction.
>>>> It could be that this is done to work around some hw behavior, but
>>>> not because of dirty catching.
>>>>
>>>>> Normally if the drivers call vmf_insert_pfn_prot and also supply
>>>>> 'pfn_mkwrite' callback within vm_operations_struct which requires
>>>>> the pte to be dirty catching then the vmf_insert_pfn_prot and the
>>>>> double fault are reasonable. It is not a problem.
>>>> Well, as far as I can see that behavior absolutely doesn't make sense.
>>>>
>>>> When pfn_mkwrite is requested then the driver should use PAGE_COPY,
>>>> which is exactly what VMWGFX (the only driver using dirty tracking) is
>> doing.
>>>> Everybody else uses PAGE_SHARED which should make the pte writeable
>>>> immediately.
>>>>
>>>> Regards,
>>>> Christian.
>>>>> vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned
>>>>> long
>>>> addr,
>>>>> unsigned long pfn)
>>>>> {
>>>>> - return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
>>>>> + return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot,
>>>>> +false);
>>>>> }
>>>>> EXPORT_SYMBOL(vmf_insert_pfn);
>>>>>
>>>>> +vm_fault_t vmf_insert_pfn_mkwrite(struct vm_area_struct *vma,
>>>>> +unsigned
>>>> long addr,
>>>>> + unsigned long pfn) {
>>>>> + return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot,
>>>> true);
>>>>> +} EXPORT_SYMBOL(vmf_insert_pfn_mkwrite);
>>>>> +
>>>>> static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
>>>>> {
>>>>> /* these checks mirror the abort conditions in vm_normal_page
>>>>> */
[AMD Official Use Only - General]
> >>>>> The vmf_insert_pfn_prot could cause unnecessary double faults on a
> >>>>> device pfn. Because currently the vmf_insert_pfn_prot does not
> >>>>> make the pfn writable so the pte entry is normally read-only or
> >>>>> dirty catching.
> >>>> What? How do you got to this conclusion?
> >>> Sorry. I did not mention that this problem only exists on arm64 platform.
> >> Ok, that makes at least a little bit more sense.
> >>
> >>> Because on arm64 platform the PTE_RDONLY is automatically attached
> >>> to the userspace pte entries even through VM_WRITE + VM_SHARE.
> >>> The PTE_RDONLY needs to be cleared in vmf_insert_pfn_prot. However
> >>> vmf_insert_pfn_prot do not make the pte writable passing false
> >>> @mkwrite to insert_pfn.
> >> Question is why is arm64 doing this? As far as I can see they must
> >> have some hardware reason for that.
> >>
> >> The mkwrite parameter to insert_pfn() was added by commit
> >> b2770da642540 to make insert_pfn() look more like insert_pfn_pmd() so
> >> that the DAX code can insert PTEs which are writable and dirty at the same
> time.
> >>
> > This is one scenario to do so. In fact on arm64 there are many
> > scenarios could be to do so. So we can let vmf_insert_pfn_prot
> > supporting @mkwrite for drivers at core layer and let drivers to
> > decide whether or not to make writable and dirty at one time. The
> > patch did this. Otherwise double faults on arm64 when call
> vmf_insert_pfn_prot.
>
> Well, that doesn't answer my question why arm64 is double faulting in the
> first place,.
>
Eh.
On arm64, when userspace mmap()s with PROT_WRITE and MAP_SHARED, the
vma->vm_page_prot has both PTE_RDONLY and PTE_WRITE set via
PAGE_SHARED_EXEC (see the arm64 protection_map).
When userspace then writes to the mapped virtual address, the first fault
happens and calls into the driver's
.fault->ttm_bo_vm_fault_reserved->vmf_insert_pfn_prot->insert_pfn.
insert_pfn establishes the pte entry, but vmf_insert_pfn_prot passes false
for @mkwrite by default, so insert_pfn cannot make the pfn writable and
does not call maybe_mkwrite(pte_mkdirty(entry), vma) to clear the
PTE_RDONLY bit. The pte entry is therefore still write-protected as far as
the MMU is concerned.
So when the first fault returns and the store instruction is re-executed,
a second fault happens. That second fault only does pte_mkdirty(entry),
which clears PTE_RDONLY.
That is my understanding; I hope it is not wrong.
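For reference, the @mkwrite handling in insert_pfn() looks roughly like
this (my paraphrase, not the exact mm/memory.c code):

        /* paraphrased sketch of insert_pfn()'s @mkwrite handling, not the exact code */
        entry = pte_mkspecial(pfn_t_pte(pfn, prot));
        if (mkwrite) {
                entry = pte_mkyoung(entry);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        }
        set_pte_at(vma->vm_mm, addr, pte, entry);
        /* with mkwrite == false the entry keeps PTE_RDONLY on arm64 */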
> So as long as this isn't sorted out I'm going to reject this patch.
>
> Regards,
> Christian.
>
> >
> >> This is a completely different use case to what you try to use it
> >> here for and that looks extremely fishy to me.
> >>
> >> Regards,
> >> Christian.
> >>
> >>>>> The first fault only sets up the pte entry which actually is dirty
> >>>>> catching. And the second immediate fault to the pfn due to first
> >>>>> dirty catching when the cpu re-execute the store instruction.
> >>>> It could be that this is done to work around some hw behavior, but
> >>>> not because of dirty catching.
> >>>>
> >>>>> Normally if the drivers call vmf_insert_pfn_prot and also supply
> >>>>> 'pfn_mkwrite' callback within vm_operations_struct which requires
> >>>>> the pte to be dirty catching then the vmf_insert_pfn_prot and the
> >>>>> double fault are reasonable. It is not a problem.
> >>>> Well, as far as I can see that behavior absolutely doesn't make sense.
> >>>>
> >>>> When pfn_mkwrite is requested then the driver should use PAGE_COPY,
> >>>> which is exactly what VMWGFX (the only driver using dirty tracking)
> >>>> is
> >> doing.
> >>>> Everybody else uses PAGE_SHARED which should make the pte writeable
> >>>> immediately.
> >>>>
> >>>> Regards,
> >>>> Christian.
> >>>>
> >>>>> However the most of drivers calling vmf_insert_pfn_prot do not
> >>>>> supply the 'pfn_mkwrite' callback so that the second fault is
> unnecessary.
> >>>>>
> >>>>> So just like vmf_insert_mixed and vmf_insert_mixed_mkwrite pair,
> >>>>> we should also supply vmf_insert_pfn_mkwrite for drivers as well.
> >>>>>
> >>>>> Signed-off-by: Xianrong Zhou <[email protected]>
"Zhou, Xianrong" <[email protected]> writes:
>> >>>>> The vmf_insert_pfn_prot could cause unnecessary double faults on a
>> >>>>> device pfn. Because currently the vmf_insert_pfn_prot does not
>> >>>>> make the pfn writable so the pte entry is normally read-only or
>> >>>>> dirty catching.
>> >>>> What? How do you got to this conclusion?
>> >>> Sorry. I did not mention that this problem only exists on arm64 platform.
>> >> Ok, that makes at least a little bit more sense.
>> >>
>> >>> Because on arm64 platform the PTE_RDONLY is automatically attached
>> >>> to the userspace pte entries even through VM_WRITE + VM_SHARE.
>> >>> The PTE_RDONLY needs to be cleared in vmf_insert_pfn_prot. However
>> >>> vmf_insert_pfn_prot do not make the pte writable passing false
>> >>> @mkwrite to insert_pfn.
>> >> Question is why is arm64 doing this? As far as I can see they must
>> >> have some hardware reason for that.
>> >>
>> >> The mkwrite parameter to insert_pfn() was added by commit
>> >> b2770da642540 to make insert_pfn() look more like insert_pfn_pmd() so
>> >> that the DAX code can insert PTEs which are writable and dirty at the same
>> time.
>> >>
>> > This is one scenario to do so. In fact on arm64 there are many
>> > scenarios could be to do so. So we can let vmf_insert_pfn_prot
>> > supporting @mkwrite for drivers at core layer and let drivers to
>> > decide whether or not to make writable and dirty at one time. The
>> > patch did this. Otherwise double faults on arm64 when call
>> vmf_insert_pfn_prot.
>>
>> Well, that doesn't answer my question why arm64 is double faulting in the
>> first place,.
>>
>
>
> Eh.
>
> On arm64 When userspace mmap() with PROT_WRITE and MAP_SHARED the
> vma->vm_page_prot has the PTE_RDONLY and PTE_WRITE within
> PAGE_SHARED_EXEC. (seeing arm64 protection_map)
>
> When write the userspace virtual address the first fault happen and call
> into driver's .fault->ttm_bo_vm_fault_reserved->vmf_insert_pfn_prot->insert_pfn.
> The insert_pfn will establish the pte entry. However the vmf_insert_pfn_prot
> pass false @mkwrite to insert_pfn by default and so insert_pfn could not make
> the pfn writable and it do not call maybe_mkwrite(pte_mkdirty(entry), vma)
> to clear the PTE_RDONLY bit. So the pte entry is actually write protection for mmu.
> So when the first fault return and re-execute the store instruction the second
> fault happen again. And in second fault it just only do pte_mkdirty(entry) which
> clear the PTE_RDONLY.
It depends on whether the ARM64 CPU in question supports hardware dirty bit
management (DBM). If it does and PTE_DBM (i.e. PTE_WRITE) is set, HW will
automatically clear the PTE_RDONLY bit to mark the entry dirty instead of
raising a write fault. So you shouldn't see a double fault if
PTE_DBM/WRITE is set.
On ARM64 you can kind of think of PTE_RDONLY as the HW dirty bit and
PTE_DBM as the read/write permission bit, with SW being responsible for
updating PTE_RDONLY via the fault handler if DBM is not supported by HW.
At least that's my understanding from having hacked on this in the
past. You can see all this weirdness happening in the definitions of
pte_dirty() and pte_write() for ARM64.
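Roughly, and simplified from memory rather than the exact
arch/arm64/include/asm/pgtable.h code, those helpers look like:

        /* simplified sketch, not the exact upstream definitions */
        #define pte_write(pte)  (!!(pte_val(pte) & PTE_WRITE))  /* PTE_WRITE is PTE_DBM */

        static inline int pte_hw_dirty(pte_t pte)
        {
                /* DBM-capable HW marks an entry dirty by clearing PTE_RDONLY */
                return pte_write(pte) && !(pte_val(pte) & PTE_RDONLY);
        }

        static inline int pte_dirty(pte_t pte)
        {
                /* dirty if the SW dirty bit is set or HW has cleared PTE_RDONLY */
                return (pte_val(pte) & PTE_DIRTY) || pte_hw_dirty(pte);
        }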
> I think so and hope no wrong.
>
>> So as long as this isn't sorted out I'm going to reject this patch.
>>
>> Regards,
>> Christian.
>>
>> >
>> >> This is a completely different use case to what you try to use it
>> >> here for and that looks extremely fishy to me.
>> >>
>> >> Regards,
>> >> Christian.
>> >>
>> >>>>> The first fault only sets up the pte entry which actually is dirty
>> >>>>> catching. And the second immediate fault to the pfn due to first
>> >>>>> dirty catching when the cpu re-execute the store instruction.
>> >>>> It could be that this is done to work around some hw behavior, but
>> >>>> not because of dirty catching.
>> >>>>
>> >>>>> Normally if the drivers call vmf_insert_pfn_prot and also supply
>> >>>>> 'pfn_mkwrite' callback within vm_operations_struct which requires
>> >>>>> the pte to be dirty catching then the vmf_insert_pfn_prot and the
>> >>>>> double fault are reasonable. It is not a problem.
>> >>>> Well, as far as I can see that behavior absolutely doesn't make sense.
>> >>>>
>> >>>> When pfn_mkwrite is requested then the driver should use PAGE_COPY,
>> >>>> which is exactly what VMWGFX (the only driver using dirty tracking)
>> >>>> is
>> >> doing.
>> >>>> Everybody else uses PAGE_SHARED which should make the pte writeable
>> >>>> immediately.
>> >>>>
>> >>>> Regards,
>> >>>> Christian.
>> >>>>
>> >>>>> However the most of drivers calling vmf_insert_pfn_prot do not
>> >>>>> supply the 'pfn_mkwrite' callback so that the second fault is
>> unnecessary.
>> >>>>>
>> >>>>> So just like vmf_insert_mixed and vmf_insert_mixed_mkwrite pair,
>> >>>>> we should also supply vmf_insert_pfn_mkwrite for drivers as well.
>> >>>>>
>> >>>>> Signed-off-by: Xianrong Zhou <[email protected]>
On 24.01.24 at 12:04, Alistair Popple wrote:
> "Zhou, Xianrong" <[email protected]> writes:
>
>>>>>>>> The vmf_insert_pfn_prot could cause unnecessary double faults on a
>>>>>>>> device pfn. Because currently the vmf_insert_pfn_prot does not
>>>>>>>> make the pfn writable so the pte entry is normally read-only or
>>>>>>>> dirty catching.
>>>>>>> What? How do you got to this conclusion?
>>>>>> Sorry. I did not mention that this problem only exists on arm64 platform.
>>>>> Ok, that makes at least a little bit more sense.
>>>>>
>>>>>> Because on arm64 platform the PTE_RDONLY is automatically attached
>>>>>> to the userspace pte entries even through VM_WRITE + VM_SHARE.
>>>>>> The PTE_RDONLY needs to be cleared in vmf_insert_pfn_prot. However
>>>>>> vmf_insert_pfn_prot do not make the pte writable passing false
>>>>>> @mkwrite to insert_pfn.
>>>>> Question is why is arm64 doing this? As far as I can see they must
>>>>> have some hardware reason for that.
>>>>>
>>>>> The mkwrite parameter to insert_pfn() was added by commit
>>>>> b2770da642540 to make insert_pfn() look more like insert_pfn_pmd() so
>>>>> that the DAX code can insert PTEs which are writable and dirty at the same
>>> time.
>>>> This is one scenario to do so. In fact on arm64 there are many
>>>> scenarios could be to do so. So we can let vmf_insert_pfn_prot
>>>> supporting @mkwrite for drivers at core layer and let drivers to
>>>> decide whether or not to make writable and dirty at one time. The
>>>> patch did this. Otherwise double faults on arm64 when call
>>> vmf_insert_pfn_prot.
>>>
>>> Well, that doesn't answer my question why arm64 is double faulting in the
>>> first place,.
>>>
>>
>> Eh.
>>
>> On arm64 When userspace mmap() with PROT_WRITE and MAP_SHARED the
>> vma->vm_page_prot has the PTE_RDONLY and PTE_WRITE within
>> PAGE_SHARED_EXEC. (seeing arm64 protection_map)
Well, that's your observation, but not an explanation of why arm64 is
doing this.
See, this would have quite a negative impact on performance, not only
for gfx drivers but in general.
So either the observation is incorrect or there is a *really* good
reason why arm64 is taking this performance penalty.
>> When write the userspace virtual address the first fault happen and call
>> into driver's .fault->ttm_bo_vm_fault_reserved->vmf_insert_pfn_prot->insert_pfn.
>> The insert_pfn will establish the pte entry. However the vmf_insert_pfn_prot
>> pass false @mkwrite to insert_pfn by default and so insert_pfn could not make
>> the pfn writable and it do not call maybe_mkwrite(pte_mkdirty(entry), vma)
>> to clear the PTE_RDONLY bit. So the pte entry is actually write protection for mmu.
>> So when the first fault return and re-execute the store instruction the second
>> fault happen again. And in second fault it just only do pte_mkdirty(entry) which
>> clear the PTE_RDONLY.
> It depends if the ARM64 CPU in question supports hardware dirty bit
> management (DBM). If that is the case and PTE_DBM (ie. PTE_WRITE) is set
> HW will automatically clear PTE_RDONLY bit to mark the entry dirty
> instead of raising a write fault. So you shouldn't see a double fault if
> PTE_DBM/WRITE is set.
>
> On ARM64 you can kind of think of PTE_RDONLY as the HW dirty bit and
> PTE_DBM as the read/write permission bit with SW being responsible for
> updating PTE_RDONLY via the fault handler if DBM is not supported by HW.
>
> At least that's my understanding from having hacked on this in the
> past. You can see all this weirdness happening in the definitions of
> pte_dirty() and pte_write() for ARM64.
+1
Thanks a lot for that; this was exactly the information I was looking for.
In this light the patch here looks unnecessary and questionable at best.
Xianrong, if you have an arm64 platform which really double faults
(confirmed through a debugger, for example), then you have to ask why this
platform shows this behavior rather than try to work around it.
Behaviors like those usually have a very, very good reason, and without a
confirmed explanation I'm not allowing in any patch which would disable
stuff like that.
Regards,
Christian.
>
>> I think so and hope no wrong.
>>
>>> So as long as this isn't sorted out I'm going to reject this patch.
>>>
>>> Regards,
>>> Christian.
>>>
>>>>> This is a completely different use case to what you try to use it
>>>>> here for and that looks extremely fishy to me.
>>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>>>> The first fault only sets up the pte entry which actually is dirty
>>>>>>>> catching. And the second immediate fault to the pfn due to first
>>>>>>>> dirty catching when the cpu re-execute the store instruction.
>>>>>>> It could be that this is done to work around some hw behavior, but
>>>>>>> not because of dirty catching.
>>>>>>>
>>>>>>>> Normally if the drivers call vmf_insert_pfn_prot and also supply
>>>>>>>> 'pfn_mkwrite' callback within vm_operations_struct which requires
>>>>>>>> the pte to be dirty catching then the vmf_insert_pfn_prot and the
>>>>>>>> double fault are reasonable. It is not a problem.
>>>>>>> Well, as far as I can see that behavior absolutely doesn't make sense.
>>>>>>>
>>>>>>> When pfn_mkwrite is requested then the driver should use PAGE_COPY,
>>>>>>> which is exactly what VMWGFX (the only driver using dirty tracking)
>>>>>>> is
>>>>> doing.
>>>>>>> Everybody else uses PAGE_SHARED which should make the pte writeable
>>>>>>> immediately.
>>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>>> However the most of drivers calling vmf_insert_pfn_prot do not
>>>>>>>> supply the 'pfn_mkwrite' callback so that the second fault is
>>> unnecessary.
>>>>>>>> So just like vmf_insert_mixed and vmf_insert_mixed_mkwrite pair,
>>>>>>>> we should also supply vmf_insert_pfn_mkwrite for drivers as well.
>>>>>>>>
>>>>>>>> Signed-off-by: Xianrong Zhou <[email protected]>
[AMD Official Use Only - General]
> >>>>>>>> The vmf_insert_pfn_prot could cause unnecessary double faults
> >>>>>>>> on a device pfn. Because currently the vmf_insert_pfn_prot does
> >>>>>>>> not make the pfn writable so the pte entry is normally
> >>>>>>>> read-only or dirty catching.
> >>>>>>> What? How do you got to this conclusion?
> >>>>>> Sorry. I did not mention that this problem only exists on arm64
> platform.
> >>>>> Ok, that makes at least a little bit more sense.
> >>>>>
> >>>>>> Because on arm64 platform the PTE_RDONLY is automatically
> >>>>>> attached to the userspace pte entries even through VM_WRITE +
> VM_SHARE.
> >>>>>> The PTE_RDONLY needs to be cleared in vmf_insert_pfn_prot.
> >>>>>> However vmf_insert_pfn_prot do not make the pte writable passing
> >>>>>> false @mkwrite to insert_pfn.
> >>>>> Question is why is arm64 doing this? As far as I can see they must
> >>>>> have some hardware reason for that.
> >>>>>
> >>>>> The mkwrite parameter to insert_pfn() was added by commit
> >>>>> b2770da642540 to make insert_pfn() look more like insert_pfn_pmd()
> >>>>> so that the DAX code can insert PTEs which are writable and dirty
> >>>>> at the same
> >>> time.
> >>>> This is one scenario to do so. In fact on arm64 there are many
> >>>> scenarios could be to do so. So we can let vmf_insert_pfn_prot
> >>>> supporting @mkwrite for drivers at core layer and let drivers to
> >>>> decide whether or not to make writable and dirty at one time. The
> >>>> patch did this. Otherwise double faults on arm64 when call
> >>> vmf_insert_pfn_prot.
> >>>
> >>> Well, that doesn't answer my question why arm64 is double faulting
> >>> in the first place,.
> >>>
> >>
> >> Eh.
> >>
> >> On arm64 When userspace mmap() with PROT_WRITE and MAP_SHARED
> the
> >> vma->vm_page_prot has the PTE_RDONLY and PTE_WRITE within
> >> PAGE_SHARED_EXEC. (seeing arm64 protection_map)
>
> Well that's your observation, but not the explanation why arm64 is doing this.
>
> See this would have quite some negative impact on performance, not only for
> gfx drivers but in general.
>
> So either the observation is incorrect or there is a *really* good reason why
> arm64 is taking this performance penalty.
>
> >> When write the userspace virtual address the first fault happen and
> >> call into driver's .fault->ttm_bo_vm_fault_reserved->vmf_insert_pfn_prot-
> >insert_pfn.
> >> The insert_pfn will establish the pte entry. However the
> >> vmf_insert_pfn_prot pass false @mkwrite to insert_pfn by default and
> >> so insert_pfn could not make the pfn writable and it do not call
> >> maybe_mkwrite(pte_mkdirty(entry), vma) to clear the PTE_RDONLY bit. So
> the pte entry is actually write protection for mmu.
> >> So when the first fault return and re-execute the store instruction
> >> the second fault happen again. And in second fault it just only do
> >> pte_mkdirty(entry) which clear the PTE_RDONLY.
> > It depends if the ARM64 CPU in question supports hardware dirty bit
> > management (DBM). If that is the case and PTE_DBM (ie. PTE_WRITE) is
> > set HW will automatically clear PTE_RDONLY bit to mark the entry dirty
> > instead of raising a write fault. So you shouldn't see a double fault
> > if PTE_DBM/WRITE is set.
Thanks. This is reasonable. But I really do still see the double faults on
my project's platform.
> >
> > On ARM64 you can kind of think of PTE_RDONLY as the HW dirty bit and
> > PTE_DBM as the read/write permission bit with SW being responsible for
> > updating PTE_RDONLY via the fault handler if DBM is not supported by HW.
> >
> > At least that's my understanding from having hacked on this in the
> > past. You can see all this weirdness happening in the definitions of
> > pte_dirty() and pte_write() for ARM64.
>
> +1
>
> Thanks a lot for that, this was exactly the information I was looking for.
>
> In this light it makes this patch here look unnecessary and questionable at
> best.
>
> Xianrong if you have an arm64 platform which really double faults (confirmed
> through a debugger for example) then you have to ask why this platform
> shows this behavior and not try to work around it.
>
> Behaviors like those usually have a very very good reason and without a
> confirmed explanation I'm not allowing any patch in which would disable stuff
> like that.
Thanks, you are quite right. I found that CONFIG_ARM64_HW_AFDBM is not enabled
in my project, so PTE_DBM is actually a software-managed bit there, not a
hardware one. Now I understand why arm64 sets PTE_RDONLY automatically: the
same protection bits work whether or not the hardware manages the dirty state.
This answers your question.
So the double faults I saw occur when CONFIG_ARM64_HW_AFDBM is disabled. They
can be eliminated by replacing vmf_insert_mixed with vmf_insert_mixed_mkwrite
in the drivers when CONFIG_ARM64_HW_AFDBM=n. vmf_insert_pfn_prot is similar to
vmf_insert_mixed: it should take an @mkwrite parameter instead of hard-coding
mkwrite to false.
So I think this change is still meaningful when CONFIG_ARM64_HW_AFDBM is not
enabled, or on hardware that does not support hardware DBM at all.
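
For reference, my reading of the relevant arm64 helpers (roughly from
arch/arm64/include/asm/pgtable.h, quoted from memory so the details may be
slightly off) is:

	/* PTE_WRITE doubles as the DBM bit; PTE_RDONLY acts as the HW "clean" bit */
	#define pte_write(pte)		(!!(pte_val(pte) & PTE_WRITE))
	#define pte_hw_dirty(pte)	(pte_write(pte) && !(pte_val(pte) & PTE_RDONLY))
	#define pte_sw_dirty(pte)	(!!(pte_val(pte) & PTE_DIRTY))
	#define pte_dirty(pte)		(pte_sw_dirty(pte) || pte_hw_dirty(pte))

	static inline pte_t pte_mkdirty(pte_t pte)
	{
		pte = set_pte_bit(pte, __pgprot(PTE_DIRTY));
		if (pte_write(pte))
			pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY));
		return pte;
	}

With hardware DBM the CPU clears PTE_RDONLY itself on the first store, but
without it the only thing that clears PTE_RDONLY is pte_mkdirty() in the
write-fault path, which is why I end up taking the second fault.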
>
> Regards,
> Christian.
>
> >
> >> I think that is what happens; I hope I am not wrong.
> >>
> >>> So as long as this isn't sorted out I'm going to reject this patch.
> >>>
> >>> Regards,
> >>> Christian.
> >>>
> >>>>> This is a completely different use case to what you try to use it
> >>>>> here for and that looks extremely fishy to me.
> >>>>>
> >>>>> Regards,
> >>>>> Christian.
> >>>>>
> >>>>>>>> The first fault only sets up the pte entry which actually is
> >>>>>>>> dirty catching. And the second immediate fault to the pfn due
> >>>>>>>> to first dirty catching when the cpu re-execute the store instruction.
> >>>>>>> It could be that this is done to work around some hw behavior,
> >>>>>>> but not because of dirty catching.
> >>>>>>>
> >>>>>>>> Normally if the drivers call vmf_insert_pfn_prot and also
> >>>>>>>> supply 'pfn_mkwrite' callback within vm_operations_struct which
> >>>>>>>> requires the pte to be dirty catching then the
> >>>>>>>> vmf_insert_pfn_prot and the double fault are reasonable. It is not a
> problem.
> >>>>>>> Well, as far as I can see that behavior absolutely doesn't make sense.
> >>>>>>>
> >>>>>>> When pfn_mkwrite is requested then the driver should use
> >>>>>>> PAGE_COPY, which is exactly what VMWGFX (the only driver using
> >>>>>>> dirty tracking) is
> >>>>> doing.
> >>>>>>> Everybody else uses PAGE_SHARED which should make the pte
> >>>>>>> writeable immediately.
> >>>>>>>
> >>>>>>> Regards,
> >>>>>>> Christian.
> >>>>>>>
> >>>>>>>> However the most of drivers calling vmf_insert_pfn_prot do not
> >>>>>>>> supply the 'pfn_mkwrite' callback so that the second fault is
> >>> unnecessary.
> >>>>>>>> So just like vmf_insert_mixed and vmf_insert_mixed_mkwrite
> >>>>>>>> pair, we should also supply vmf_insert_pfn_mkwrite for drivers as
> well.
> >>>>>>>>
> >>>>>>>> Signed-off-by: Xianrong Zhou <[email protected]>
> >>>>>>>> ---
> >>>>>>>> arch/x86/entry/vdso/vma.c | 3 ++-
> >>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 2 +-
> >>>>>>>> drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 2 +-
> >>>>>>>> drivers/gpu/drm/nouveau/nouveau_gem.c | 2 +-
> >>>>>>>> drivers/gpu/drm/radeon/radeon_gem.c | 2 +-
> >>>>>>>> drivers/gpu/drm/ttm/ttm_bo_vm.c | 8 +++++---
> >>>>>>>> drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c | 8 +++++---
> >>>>>>>> include/drm/ttm/ttm_bo.h | 3 ++-
> >>>>>>>> include/linux/mm.h | 2 +-
> >>>>>>>> mm/memory.c | 14 +++++++++++---
> >>>>>>>> 10 files changed, 30 insertions(+), 16 deletions(-)
> >>>>>>>>
> >>>>>>>> diff --git a/arch/x86/entry/vdso/vma.c
> >>>>>>>> b/arch/x86/entry/vdso/vma.c index
> 7645730dc228..dd2431c2975f
> >>>>>>>> 100644
> >>>>>>>> --- a/arch/x86/entry/vdso/vma.c
> >>>>>>>> +++ b/arch/x86/entry/vdso/vma.c
> >>>>>>>> @@ -185,7 +185,8 @@ static vm_fault_t vvar_fault(const struct
> >>>>>>> vm_special_mapping *sm,
> >>>>>>>> if (pvti &&
> >>>>>>>> vclock_was_used(VDSO_CLOCKMODE_PVCLOCK))
> >>>>>>> {
> >>>>>>>> return vmf_insert_pfn_prot(vma, vmf->address,
> >>>>>>>> __pa(pvti) >> PAGE_SHIFT,
> >>>>>>>> - pgprot_decrypted(vma-
> >>>>>>>> vm_page_prot));
> >>>>>>>> + pgprot_decrypted(vma-
> >>>>>>>> vm_page_prot),
> >>>>>>>> + true);
> >>>>>>>> }
> >>>>>>>> } else if (sym_offset == image->sym_hvclock_page) {
> >>>>>>>> pfn = hv_get_tsc_pfn(); diff --git
> >>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> >>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> >>>>>>>> index 49a5f1c73b3e..adcb20d9e624 100644
> >>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> >>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> >>>>>>>> @@ -64,7 +64,7 @@ static vm_fault_t amdgpu_gem_fault(struct
> >>>>> vm_fault
> >>>>>>> *vmf)
> >>>>>>>> }
> >>>>>>>>
> >>>>>>>> ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma-
> >>>>>>>> vm_page_prot,
> >>>>>>>> - TTM_BO_VM_NUM_PREFAULT);
> >>>>>>>> +
> >>>>>>>> + TTM_BO_VM_NUM_PREFAULT,
> >>>>>>> true);
> >>>>>>>> drm_dev_exit(idx);
> >>>>>>>> } else {
> >>>>>>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> >>>>>>>> b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> >>>>>>>> index 9227f8146a58..c6f13ae6c308 100644
> >>>>>>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> >>>>>>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> >>>>>>>> @@ -1114,7 +1114,7 @@ static vm_fault_t vm_fault_ttm(struct
> >>>>>>>> vm_fault
> >>>>>>>> *vmf)
> >>>>>>>>
> >>>>>>>> if (drm_dev_enter(dev, &idx)) {
> >>>>>>>> ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma-
> >>>>>>>> vm_page_prot,
> >>>>>>>> - TTM_BO_VM_NUM_PREFAULT);
> >>>>>>>> +
> >>>>>>>> + TTM_BO_VM_NUM_PREFAULT,
> >>>>>>> true);
> >>>>>>>> drm_dev_exit(idx);
> >>>>>>>> } else {
> >>>>>>>> ret = ttm_bo_vm_dummy_page(vmf, vmf->vma-
> >>>>>>>> vm_page_prot); diff --git
> >>>>>>>> a/drivers/gpu/drm/nouveau/nouveau_gem.c
> >>>>>>>> b/drivers/gpu/drm/nouveau/nouveau_gem.c
> >>>>>>>> index 49c2bcbef129..7e1453762ec9 100644
> >>>>>>>> --- a/drivers/gpu/drm/nouveau/nouveau_gem.c
> >>>>>>>> +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
> >>>>>>>> @@ -56,7 +56,7 @@ static vm_fault_t nouveau_ttm_fault(struct
> >>>>>>>> vm_fault
> >>>>>>>> *vmf)
> >>>>>>>>
> >>>>>>>> nouveau_bo_del_io_reserve_lru(bo);
> >>>>>>>> prot = vm_get_page_prot(vma->vm_flags);
> >>>>>>>> - ret = ttm_bo_vm_fault_reserved(vmf, prot,
> >>>>>>> TTM_BO_VM_NUM_PREFAULT);
> >>>>>>>> + ret = ttm_bo_vm_fault_reserved(vmf, prot,
> >>>>>>> TTM_BO_VM_NUM_PREFAULT,
> >>>>>>>> +true);
> >>>>>>>> nouveau_bo_add_io_reserve_lru(bo);
> >>>>>>>> if (ret == VM_FAULT_RETRY && !(vmf->flags &
> >>>>>>> FAULT_FLAG_RETRY_NOWAIT))
> >>>>>>>> return ret;
> >>>>>>>> diff --git a/drivers/gpu/drm/radeon/radeon_gem.c
> >>>>>>>> b/drivers/gpu/drm/radeon/radeon_gem.c
> >>>>>>>> index 3fec3acdaf28..b21cf00ae162 100644
> >>>>>>>> --- a/drivers/gpu/drm/radeon/radeon_gem.c
> >>>>>>>> +++ b/drivers/gpu/drm/radeon/radeon_gem.c
> >>>>>>>> @@ -62,7 +62,7 @@ static vm_fault_t radeon_gem_fault(struct
> >>>>>>>> vm_fault
> >>>>>>> *vmf)
> >>>>>>>> goto unlock_resv;
> >>>>>>>>
> >>>>>>>> ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma-
> >vm_page_prot,
> >>>>>>>> - TTM_BO_VM_NUM_PREFAULT);
> >>>>>>>> + TTM_BO_VM_NUM_PREFAULT,
> >>>>>>>> + true);
> >>>>>>>> if (ret == VM_FAULT_RETRY && !(vmf->flags &
> >>>>>>> FAULT_FLAG_RETRY_NOWAIT))
> >>>>>>>> goto unlock_mclk;
> >>>>>>>>
> >>>>>>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c
> >>>>>>>> b/drivers/gpu/drm/ttm/ttm_bo_vm.c index
> >>>>>>> 4212b8c91dd4..7d14a7d267aa
> >>>>>>>> 100644
> >>>>>>>> --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
> >>>>>>>> +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> >>>>>>>> @@ -167,6 +167,7 @@ EXPORT_SYMBOL(ttm_bo_vm_reserve);
> >>>>>>>> * @num_prefault: Maximum number of prefault pages. The
> >>>>>>>> caller may
> >>>>>>> want to
> >>>>>>>> * specify this based on madvice settings and the size of
> >>>>>>>> the GPU
> >>> object
> >>>>>>>> * backed by the memory.
> >>>>>>>> + * @mkwrite: make the pfn or page writable
> >>>>>>>> *
> >>>>>>>> * This function inserts one or more page table entries pointing to
> the
> >>>>>>>> * memory backing the buffer object, and then returns a
> >>>>>>>> return code @@ -180,7 +181,8 @@
> EXPORT_SYMBOL(ttm_bo_vm_reserve);
> >>>>>>>> */
> >>>>>>>> vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
> >>>>>>>> pgprot_t prot,
> >>>>>>>> - pgoff_t num_prefault)
> >>>>>>>> + pgoff_t num_prefault,
> >>>>>>>> + bool mkwrite)
> >>>>>>>> {
> >>>>>>>> struct vm_area_struct *vma = vmf->vma;
> >>>>>>>> struct ttm_buffer_object *bo = vma->vm_private_data; @@
> >>>>>>>> -263,7
> >>>>>>>> +265,7 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault
> >>>>>>>> +*vmf,
> >>>>>>>> * at arbitrary times while the data is mmap'ed.
> >>>>>>>> * See vmf_insert_pfn_prot() for a discussion.
> >>>>>>>> */
> >>>>>>>> - ret = vmf_insert_pfn_prot(vma, address, pfn, prot);
> >>>>>>>> + ret = vmf_insert_pfn_prot(vma, address, pfn, prot,
> >>>>>>>> + mkwrite);
> >>>>>>>>
> >>>>>>>> /* Never error on prefaulted PTEs */
> >>>>>>>> if (unlikely((ret & VM_FAULT_ERROR))) { @@
> >>>>>>>> -312,7
> >>>>>>>> +314,7
> >>>>>>> @@
> >>>>>>>> vm_fault_t ttm_bo_vm_dummy_page(struct vm_fault *vmf,
> pgprot_t
> >>>>> prot)
> >>>>>>>> /* Prefault the entire VMA range right away to avoid further
> faults */
> >>>>>>>> for (address = vma->vm_start; address < vma->vm_end;
> >>>>>>>> address += PAGE_SIZE)
> >>>>>>>> - ret = vmf_insert_pfn_prot(vma, address, pfn, prot);
> >>>>>>>> + ret = vmf_insert_pfn_prot(vma, address, pfn, prot,
> >>>>>>>> + true);
> >>>>>>>>
> >>>>>>>> return ret;
> >>>>>>>> }
> >>>>>>>> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> >>>>>>>> b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> >>>>>>>> index 74ff2812d66a..bb8e4b641681 100644
> >>>>>>>> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> >>>>>>>> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> >>>>>>>> @@ -452,12 +452,14 @@ vm_fault_t vmw_bo_vm_fault(struct
> >>> vm_fault
> >>>>>>> *vmf)
> >>>>>>>> * sure the page protection is write-enabled so we don't get
> >>>>>>>> * a lot of unnecessary write faults.
> >>>>>>>> */
> >>>>>>>> - if (vbo->dirty && vbo->dirty->method ==
> >>> VMW_BO_DIRTY_MKWRITE)
> >>>>>>>> + if (vbo->dirty && vbo->dirty->method ==
> >>> VMW_BO_DIRTY_MKWRITE)
> >>>>>>> {
> >>>>>>>> prot = vm_get_page_prot(vma->vm_flags & ~VM_SHARED);
> >>>>>>>> - else
> >>>>>>>> + ret = ttm_bo_vm_fault_reserved(vmf, prot,
> >>>>>>>> + num_prefault,
> >>>>>>> false);
> >>>>>>>> + } else {
> >>>>>>>> prot = vm_get_page_prot(vma->vm_flags);
> >>>>>>>> + ret = ttm_bo_vm_fault_reserved(vmf, prot,
> >>>>>>>> + num_prefault,
> >>>>>>> true);
> >>>>>>>> + }
> >>>>>>>>
> >>>>>>>> - ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault);
> >>>>>>>> if (ret == VM_FAULT_RETRY && !(vmf->flags &
> >>>>>>> FAULT_FLAG_RETRY_NOWAIT))
> >>>>>>>> return ret;
> >>>>>>>>
> >>>>>>>> diff --git a/include/drm/ttm/ttm_bo.h
> >>>>>>>> b/include/drm/ttm/ttm_bo.h index
> 0223a41a64b2..66e293db69ee
> >>>>>>>> 100644
> >>>>>>>> --- a/include/drm/ttm/ttm_bo.h
> >>>>>>>> +++ b/include/drm/ttm/ttm_bo.h
> >>>>>>>> @@ -386,7 +386,8 @@ vm_fault_t ttm_bo_vm_reserve(struct
> >>>>>>> ttm_buffer_object *bo,
> >>>>>>>> struct vm_fault *vmf);
> >>>>>>>> vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
> >>>>>>>> pgprot_t prot,
> >>>>>>>> - pgoff_t num_prefault);
> >>>>>>>> + pgoff_t num_prefault,
> >>>>>>>> + bool mkwrite);
> >>>>>>>> vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf);
> >>>>>>>> void ttm_bo_vm_open(struct vm_area_struct *vma);
> >>>>>>>> void ttm_bo_vm_close(struct vm_area_struct *vma); diff
> >>>>>>>> --git a/include/linux/mm.h b/include/linux/mm.h index
> >>>>>>>> f5a97dec5169..f8868e28ea04 100644
> >>>>>>>> --- a/include/linux/mm.h
> >>>>>>>> +++ b/include/linux/mm.h
> >>>>>>>> @@ -3553,7 +3553,7 @@ int vm_map_pages_zero(struct
> >>>>> vm_area_struct
> >>>>>>> *vma, struct page **pages,
> >>>>>>>> vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma,
> >>>>>>>> unsigned long
> >>>>>>> addr,
> >>>>>>>> unsigned long pfn);
> >>>>>>>> vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma,
> >>>>>>>> unsigned
> >>>>>>> long addr,
> >>>>>>>> - unsigned long pfn, pgprot_t pgprot);
> >>>>>>>> + unsigned long pfn, pgprot_t pgprot, bool
> >>>>>>>> + mkwrite);
> >>>>>>>> vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma,
> >>>>>>>> unsigned long
> >>>>>>> addr,
> >>>>>>>> pfn_t pfn);
> >>>>>>>> vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct
> >>>>>>>> *vma, diff --git a/mm/memory.c b/mm/memory.c index
> >>>>>>>> 7e1f4849463a..2c28f1a349ff
> >>>>>>>> 100644
> >>>>>>>> --- a/mm/memory.c
> >>>>>>>> +++ b/mm/memory.c
> >>>>>>>> @@ -2195,6 +2195,7 @@ static vm_fault_t insert_pfn(struct
> >>>>>>> vm_area_struct *vma, unsigned long addr,
> >>>>>>>> * @addr: target user address of this page
> >>>>>>>> * @pfn: source kernel pfn
> >>>>>>>> * @pgprot: pgprot flags for the inserted page
> >>>>>>>> + * @mkwrite: make the pfn writable
> >>>>>>>> *
> >>>>>>>> * This is exactly like vmf_insert_pfn(), except that it allows drivers
> >>>>>>>> * to override pgprot on a per-page basis.
> >>>>>>>> @@ -2223,7 +2224,7 @@ static vm_fault_t insert_pfn(struct
> >>>>>>> vm_area_struct *vma, unsigned long addr,
> >>>>>>>> * Return: vm_fault_t value.
> >>>>>>>> */
> >>>>>>>> vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma,
> >>>>>>>> unsigned
> >>>>>>> long addr,
> >>>>>>>> - unsigned long pfn, pgprot_t pgprot)
> >>>>>>>> + unsigned long pfn, pgprot_t pgprot, bool
> >>>>>>>> + mkwrite)
> >>>>>>>> {
> >>>>>>>> /*
> >>>>>>>> * Technically, architectures with pte_special can avoid
> >>>>>>>> all these @@ -2246,7 +2247,7 @@ vm_fault_t
> >>> vmf_insert_pfn_prot(struct
> >>>>>>> vm_area_struct *vma, unsigned long addr,
> >>>>>>>> track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn,
> >>>>>>>> PFN_DEV));
> >>>>>>>>
> >>>>>>>> return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn,
> >>>>>>>> PFN_DEV),
> >>> pgprot,
> >>>>>>>> - false);
> >>>>>>>> + mkwrite);
> >>>>>>>> }
> >>>>>>>> EXPORT_SYMBOL(vmf_insert_pfn_prot);
> >>>>>>>>
> >>>>>>>> @@ -2273,10 +2274,17 @@
> EXPORT_SYMBOL(vmf_insert_pfn_prot);
> >>>>>>>> vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma,
> >>>>>>>> unsigned long
> >>>>>>> addr,
> >>>>>>>> unsigned long pfn)
> >>>>>>>> {
> >>>>>>>> - return vmf_insert_pfn_prot(vma, addr, pfn, vma-
> >vm_page_prot);
> >>>>>>>> + return vmf_insert_pfn_prot(vma, addr, pfn,
> >>>>>>>> +vma->vm_page_prot, false);
> >>>>>>>> }
> >>>>>>>> EXPORT_SYMBOL(vmf_insert_pfn);
> >>>>>>>>
> >>>>>>>> +vm_fault_t vmf_insert_pfn_mkwrite(struct vm_area_struct *vma,
> >>>>>>>> +unsigned
> >>>>>>> long addr,
> >>>>>>>> + unsigned long pfn) {
> >>>>>>>> + return vmf_insert_pfn_prot(vma, addr, pfn,
> >>>>>>>> + vma->vm_page_prot,
> >>>>>>> true);
> >>>>>>>> +} EXPORT_SYMBOL(vmf_insert_pfn_mkwrite);
> >>>>>>>> +
> >>>>>>>> static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
> >>>>>>>> {
> >>>>>>>> /* these checks mirror the abort conditions in
> >>>>>>>> vm_normal_page */