This patchset allows KVM to map huge pages for DAX-backed files.
v4 -> v5:
v4: https://lore.kernel.org/lkml/[email protected]/
- Rebased onto kvm/queue
- Removed the switch statement and fixed PUD_SIZE; just use
dev_pagemap_mapping_shift() > PAGE_SHIFT
- Added explanation of parameter changes to patch 1's commit message
v3 -> v4:
v3: https://lore.kernel.org/lkml/[email protected]/
- Rebased onto linus/master
v2 -> v3:
v2: https://lore.kernel.org/lkml/[email protected]/
- Updated Acks/Reviewed-by
- Rebased onto linux-next
v1 -> v2:
https://lore.kernel.org/lkml/[email protected]/
- Updated Acks/Reviewed-by
- Minor touchups
- Added patch to remove redundant PageReserved() check
- Rebased onto linux-next
RFC/discussion thread:
https://lore.kernel.org/lkml/[email protected]/
Barret Rhoden (2):
mm: make dev_pagemap_mapping_shift() externally visible
kvm: Use huge pages for DAX-backed files
arch/x86/kvm/mmu/mmu.c | 31 +++++++++++++++++++++++++++----
include/linux/mm.h | 3 +++
mm/memory-failure.c | 38 +++-----------------------------------
mm/util.c | 34 ++++++++++++++++++++++++++++++++++
4 files changed, 67 insertions(+), 39 deletions(-)
--
2.24.0.525.g8f36a354ae-goog
KVM has a use case for determining the size of a dax mapping.
The KVM code has easy access to the address and the mm, and
dev_pagemap_mapping_shift() needs only those parameters. It was
deriving them from page and vma. This commit changes those parameters
from (page, vma) to (address, mm).
Signed-off-by: Barret Rhoden <[email protected]>
Reviewed-by: David Hildenbrand <[email protected]>
Acked-by: Dan Williams <[email protected]>
---
include/linux/mm.h | 3 +++
mm/memory-failure.c | 38 +++-----------------------------------
mm/util.c | 34 ++++++++++++++++++++++++++++++++++
3 files changed, 40 insertions(+), 35 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a2adf95b3f9c..bfd1882dd5c6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1013,6 +1013,9 @@ static inline bool is_pci_p2pdma_page(const struct page *page)
#define page_ref_zero_or_close_to_overflow(page) \
((unsigned int) page_ref_count(page) + 127u <= 127u)
+unsigned long dev_pagemap_mapping_shift(unsigned long address,
+ struct mm_struct *mm);
+
static inline void get_page(struct page *page)
{
page = compound_head(page);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 3151c87dff73..bafa464c8290 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -261,40 +261,6 @@ void shake_page(struct page *p, int access)
}
EXPORT_SYMBOL_GPL(shake_page);
-static unsigned long dev_pagemap_mapping_shift(struct page *page,
- struct vm_area_struct *vma)
-{
- unsigned long address = vma_address(page, vma);
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- pgd = pgd_offset(vma->vm_mm, address);
- if (!pgd_present(*pgd))
- return 0;
- p4d = p4d_offset(pgd, address);
- if (!p4d_present(*p4d))
- return 0;
- pud = pud_offset(p4d, address);
- if (!pud_present(*pud))
- return 0;
- if (pud_devmap(*pud))
- return PUD_SHIFT;
- pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd))
- return 0;
- if (pmd_devmap(*pmd))
- return PMD_SHIFT;
- pte = pte_offset_map(pmd, address);
- if (!pte_present(*pte))
- return 0;
- if (pte_devmap(*pte))
- return PAGE_SHIFT;
- return 0;
-}
-
/*
* Failure handling: if we can't find or can't kill a process there's
* not much we can do. We just print a message and ignore otherwise.
@@ -324,7 +290,9 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
}
tk->addr = page_address_in_vma(p, vma);
if (is_zone_device_page(p))
- tk->size_shift = dev_pagemap_mapping_shift(p, vma);
+ tk->size_shift =
+ dev_pagemap_mapping_shift(vma_address(p, vma),
+ vma->vm_mm);
else
tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;
diff --git a/mm/util.c b/mm/util.c
index 3ad6db9a722e..59984e6b40ab 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -901,3 +901,37 @@ int memcmp_pages(struct page *page1, struct page *page2)
kunmap_atomic(addr1);
return ret;
}
+
+unsigned long dev_pagemap_mapping_shift(unsigned long address,
+ struct mm_struct *mm)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ return 0;
+ p4d = p4d_offset(pgd, address);
+ if (!p4d_present(*p4d))
+ return 0;
+ pud = pud_offset(p4d, address);
+ if (!pud_present(*pud))
+ return 0;
+ if (pud_devmap(*pud))
+ return PUD_SHIFT;
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return 0;
+ if (pmd_devmap(*pmd))
+ return PMD_SHIFT;
+ pte = pte_offset_map(pmd, address);
+ if (!pte_present(*pte))
+ return 0;
+ if (pte_devmap(*pte))
+ return PAGE_SHIFT;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dev_pagemap_mapping_shift);
--
2.24.0.525.g8f36a354ae-goog
This change allows KVM to map DAX-backed files made of huge pages with
huge mappings in the EPT/TDP.
DAX pages are not PageTransCompound. The existing check is trying to
determine if the mapping for the pfn is a huge mapping or not. For
non-DAX maps, e.g. hugetlbfs, that means checking PageTransCompound.
For DAX, we can check the page table itself.
Note that KVM already faulted in the page (or huge page) in the host's
page table, and we hold the KVM mmu spinlock. We grabbed that lock in
kvm_mmu_notifier_invalidate_range_end, before checking the mmu seq.
Signed-off-by: Barret Rhoden <[email protected]>
---
arch/x86/kvm/mmu/mmu.c | 31 +++++++++++++++++++++++++++----
1 file changed, 27 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 7269130ea5e2..ea8f6951398b 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3328,6 +3328,30 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
__direct_pte_prefetch(vcpu, sp, sptep);
}
+static bool pfn_is_huge_mapped(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn)
+{
+ struct page *page = pfn_to_page(pfn);
+ unsigned long hva;
+
+ if (!is_zone_device_page(page))
+ return PageTransCompoundMap(page);
+
+ /*
+ * DAX pages do not use compound pages. The page should have already
+ * been mapped into the host-side page table during try_async_pf(), so
+ * we can check the page tables directly.
+ */
+ hva = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(hva))
+ return false;
+
+ /*
+ * Our caller grabbed the KVM mmu_lock with a successful
+ * mmu_notifier_retry, so we're safe to walk the page table.
+ */
+ return dev_pagemap_mapping_shift(hva, current->mm) > PAGE_SHIFT;
+}
+
static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
gfn_t gfn, kvm_pfn_t *pfnp,
int *levelp)
@@ -3342,8 +3366,8 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
* here.
*/
if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
- !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
- PageTransCompoundMap(pfn_to_page(pfn))) {
+ level == PT_PAGE_TABLE_LEVEL &&
+ pfn_is_huge_mapped(vcpu->kvm, gfn, pfn)) {
unsigned long mask;
/*
@@ -5957,8 +5981,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
* mapping if the indirect sp has level = 1.
*/
if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
- !kvm_is_zone_device_pfn(pfn) &&
- PageTransCompoundMap(pfn_to_page(pfn))) {
+ pfn_is_huge_mapped(kvm, sp->gfn, pfn)) {
pte_list_remove(rmap_head, sptep);
if (kvm_available_flush_tlb_with_range())
--
2.24.0.525.g8f36a354ae-goog
> On 12 Dec 2019, at 20:22, Barret Rhoden <[email protected]> wrote:
>
> This change allows KVM to map DAX-backed files made of huge pages with
> huge mappings in the EPT/TDP.
This change isn’t only relevant for TDP. It also applies when KVM uses shadow paging.
See how FNAME(page_fault)() calls transparent_hugepage_adjust().
>
> DAX pages are not PageTransCompound. The existing check is trying to
> determine if the mapping for the pfn is a huge mapping or not.
I would rephrase “The existing check is trying to determine if the pfn
is mapped as part of a transparent huge-page”.
> For
> non-DAX maps, e.g. hugetlbfs, that means checking PageTransCompound.
This is not related to hugetlbfs but rather THP.
> For DAX, we can check the page table itself.
>
> Note that KVM already faulted in the page (or huge page) in the host's
> page table, and we hold the KVM mmu spinlock. We grabbed that lock in
> kvm_mmu_notifier_invalidate_range_end, before checking the mmu seq.
>
> Signed-off-by: Barret Rhoden <[email protected]>
I don’t think the right place to change for this functionality is transparent_hugepage_adjust(),
which is meant to handle PFNs that are mapped as part of a transparent huge-page.
For example, this would prevent mapping a DAX-backed file page as 1GB,
since transparent_hugepage_adjust() only handles the (level == PT_PAGE_TABLE_LEVEL) case.
Given that you are parsing the page-tables to discover the page-size the PFN is mapped in,
I think you should instead modify kvm_host_page_size() to parse page-tables rather
than rely on vma_kernel_pagesize() (which relies on vma->vm_ops->pagesize()) in the
is_zone_device_page() case.
The main complication of doing this, though, is that at this point you don’t yet have the PFN
that is retrieved by try_async_pf(). So maybe you should consider modifying the order of calls
in tdp_page_fault() & FNAME(page_fault)().
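For illustration, a rough sketch of that alternative (not part of the posted series), based on the existing kvm_host_page_size(). It assumes the dev_pagemap_mapping_shift(address, mm) helper from patch 1 and uses vma_is_dax() as a stand-in for the zone-device check, since the pfn from try_async_pf() is not available at this point:

unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
{
	struct vm_area_struct *vma;
	unsigned long addr, shift, size = PAGE_SIZE;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return PAGE_SIZE;

	down_read(&current->mm->mmap_sem);
	vma = find_vma(current->mm, addr);
	if (!vma)
		goto out;

	if (vma_is_dax(vma)) {
		/* Ask the host page table rather than the VMA. */
		shift = dev_pagemap_mapping_shift(addr, current->mm);
		if (shift)
			size = 1UL << shift;
	} else {
		size = vma_kernel_pagesize(vma);
	}
out:
	up_read(&current->mm->mmap_sem);
	return size;
}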
-Liran
> ---
> arch/x86/kvm/mmu/mmu.c | 31 +++++++++++++++++++++++++++----
> 1 file changed, 27 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 7269130ea5e2..ea8f6951398b 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3328,6 +3328,30 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
> __direct_pte_prefetch(vcpu, sp, sptep);
> }
>
> +static bool pfn_is_huge_mapped(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn)
> +{
> + struct page *page = pfn_to_page(pfn);
> + unsigned long hva;
> +
> + if (!is_zone_device_page(page))
> + return PageTransCompoundMap(page);
> +
> + /*
> + * DAX pages do not use compound pages. The page should have already
> + * been mapped into the host-side page table during try_async_pf(), so
> + * we can check the page tables directly.
> + */
> + hva = gfn_to_hva(kvm, gfn);
> + if (kvm_is_error_hva(hva))
> + return false;
> +
> + /*
> + * Our caller grabbed the KVM mmu_lock with a successful
> + * mmu_notifier_retry, so we're safe to walk the page table.
> + */
> + return dev_pagemap_mapping_shift(hva, current->mm) > PAGE_SHIFT;
> +}
> +
> static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
> gfn_t gfn, kvm_pfn_t *pfnp,
> int *levelp)
> @@ -3342,8 +3366,8 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
> * here.
> */
> if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
> - !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
> - PageTransCompoundMap(pfn_to_page(pfn))) {
> + level == PT_PAGE_TABLE_LEVEL &&
> + pfn_is_huge_mapped(vcpu->kvm, gfn, pfn)) {
> unsigned long mask;
>
> /*
> @@ -5957,8 +5981,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
> * mapping if the indirect sp has level = 1.
> */
> if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
> - !kvm_is_zone_device_pfn(pfn) &&
> - PageTransCompoundMap(pfn_to_page(pfn))) {
> + pfn_is_huge_mapped(kvm, sp->gfn, pfn)) {
> pte_list_remove(rmap_head, sptep);
>
> if (kvm_available_flush_tlb_with_range())
> --
> 2.24.0.525.g8f36a354ae-goog
>
> On 12 Dec 2019, at 20:47, Liran Alon <[email protected]> wrote:
>
>
>
>> On 12 Dec 2019, at 20:22, Barret Rhoden <[email protected]> wrote:
>>
>> This change allows KVM to map DAX-backed files made of huge pages with
>> huge mappings in the EPT/TDP.
>
> This change isn’t only relevant for TDP. It also affects when KVM use shadow-paging.
> See how FNAME(page_fault)() calls transparent_hugepage_adjust().
>
>>
>> DAX pages are not PageTransCompound. The existing check is trying to
>> determine if the mapping for the pfn is a huge mapping or not.
>
> I would rephrase “The existing check is trying to determine if the pfn
> is mapped as part of a transparent huge-page”.
>
>> For
>> non-DAX maps, e.g. hugetlbfs, that means checking PageTransCompound.
>
> This is not related to hugetlbfs but rather THP.
>
>> For DAX, we can check the page table itself.
>>
>> Note that KVM already faulted in the page (or huge page) in the host's
>> page table, and we hold the KVM mmu spinlock. We grabbed that lock in
>> kvm_mmu_notifier_invalidate_range_end, before checking the mmu seq.
>>
>> Signed-off-by: Barret Rhoden <[email protected]>
>
> I don’t think the right place to change for this functionality is transparent_hugepage_adjust()
> which is meant to handle PFNs that are mapped as part of a transparent huge-page.
>
> For example, this would prevent mapping DAX-backed file page as 1GB.
> As transparent_hugepage_adjust() only handles the case (level == PT_PAGE_TABLE_LEVEL).
>
> As you are parsing the page-tables to discover the page-size the PFN is mapped in,
> I think you should instead modify kvm_host_page_size() to parse page-tables instead
> of rely on vma_kernel_pagesize() (Which relies on vma->vm_ops->pagesize()) in case
> of is_zone_device_page().
> The main complication though of doing this is that at this point you don’t yet have the PFN
> that is retrieved by try_async_pf(). So maybe you should consider modifying the order of calls
> in tdp_page_fault() & FNAME(page_fault)().
>
> -Liran
Or alternatively, thinking about it some more, maybe just rename transparent_hugepage_adjust()
so it isn’t specific to THP, and have it handle parsing the page-tables and changing the mapping-level to 1GB as well.
That is probably easier and more elegant.
-Liran
Hi -
On 12/12/19 1:49 PM, Liran Alon wrote:
>
>
>> On 12 Dec 2019, at 20:47, Liran Alon <[email protected]> wrote:
>>
>>
>>
>>> On 12 Dec 2019, at 20:22, Barret Rhoden <[email protected]> wrote:
>>>
>>> This change allows KVM to map DAX-backed files made of huge pages with
>>> huge mappings in the EPT/TDP.
>>
>> This change isn’t only relevant for TDP. It also affects when KVM use shadow-paging.
>> See how FNAME(page_fault)() calls transparent_hugepage_adjust().
Cool, I'll drop references to the EPT/TDP from the commit message.
>>> DAX pages are not PageTransCompound. The existing check is trying to
>>> determine if the mapping for the pfn is a huge mapping or not.
>>
>> I would rephrase “The existing check is trying to determine if the pfn
>> is mapped as part of a transparent huge-page”.
Can do.
>>
>>> For
>>> non-DAX maps, e.g. hugetlbfs, that means checking PageTransCompound.
>>
>> This is not related to hugetlbfs but rather THP.
I thought that PageTransCompound also returned true for hugetlbfs (based
on the comments in page-flags.h), though I do see the comment about the
'level == PT_PAGE_TABLE_LEVEL' check excluding hugetlbfs pages.
Anyway, I'll remove the "e.g. hugetlbfs" from the commit message.
>>
>>> For DAX, we can check the page table itself.
>>>
>>> Note that KVM already faulted in the page (or huge page) in the host's
>>> page table, and we hold the KVM mmu spinlock. We grabbed that lock in
>>> kvm_mmu_notifier_invalidate_range_end, before checking the mmu seq.
>>>
>>> Signed-off-by: Barret Rhoden <[email protected]>
>>
>> I don’t think the right place to change for this functionality is transparent_hugepage_adjust()
>> which is meant to handle PFNs that are mapped as part of a transparent huge-page.
>>
>> For example, this would prevent mapping DAX-backed file page as 1GB.
>> As transparent_hugepage_adjust() only handles the case (level == PT_PAGE_TABLE_LEVEL).
>>
>> As you are parsing the page-tables to discover the page-size the PFN is mapped in,
>> I think you should instead modify kvm_host_page_size() to parse page-tables instead
>> of rely on vma_kernel_pagesize() (Which relies on vma->vm_ops->pagesize()) in case
>> of is_zone_device_page().
>> The main complication though of doing this is that at this point you don’t yet have the PFN
>> that is retrieved by try_async_pf(). So maybe you should consider modifying the order of calls
>> in tdp_page_fault() & FNAME(page_fault)().
>>
>> -Liran
>
> Or alternatively when thinking about it more, maybe just rename transparent_hugepage_adjust()
> to not be specific to THP and better handle the case of parsing page-tables changing mapping-level to 1GB.
> That is probably easier and more elegant.
I can rename it to hugepage_adjust(), since it's not just THP anymore.
I was a little hesitant to change this to handle 1 GB pages with
this patchset at first. I didn't want to break the non-DAX cases
by doing so.
Specifically, can a THP page be 1 GB, and if so, how can you tell? If
you can't tell easily, I could walk the page table for all cases,
instead of just zone_device().
I'd also have to drop the "level == PT_PAGE_TABLE_LEVEL" check, I think,
which would open this up to hugetlbfs pages (based on the comments). Is
there any reason why that would be a bad idea?
Thanks,
Barret
> On 12 Dec 2019, at 21:55, Barret Rhoden <[email protected]> wrote:
>
> Hi -
>
> On 12/12/19 1:49 PM, Liran Alon wrote:
>>> On 12 Dec 2019, at 20:47, Liran Alon <[email protected]> wrote:
>>>
>>>
>>>
>>>> On 12 Dec 2019, at 20:22, Barret Rhoden <[email protected]> wrote:
>>>>
>>>> This change allows KVM to map DAX-backed files made of huge pages with
>>>> huge mappings in the EPT/TDP.
>>>
>>> This change isn’t only relevant for TDP. It also affects when KVM use shadow-paging.
>>> See how FNAME(page_fault)() calls transparent_hugepage_adjust().
>
> Cool, I'll drop references to the EPT/TDP from the commit message.
>
>>>> DAX pages are not PageTransCompound. The existing check is trying to
>>>> determine if the mapping for the pfn is a huge mapping or not.
>>>
>>> I would rephrase “The existing check is trying to determine if the pfn
>>> is mapped as part of a transparent huge-page”.
>
> Can do.
>
>>>
>>>> For
>>>> non-DAX maps, e.g. hugetlbfs, that means checking PageTransCompound.
>>>
>>> This is not related to hugetlbfs but rather THP.
>
> I thought that PageTransCompound also returned true for hugetlbfs (based off of comments in page-flags.h). Though I do see the comment about the 'level == PT_PAGE_TABLE_LEVEL' check excluding hugetlbfs pages.
>
> Anyway, I'll remove the "e.g. hugetlbfs" from the commit message.
>
>>>
>>>> For DAX, we can check the page table itself.
>>>>
>>>> Note that KVM already faulted in the page (or huge page) in the host's
>>>> page table, and we hold the KVM mmu spinlock. We grabbed that lock in
>>>> kvm_mmu_notifier_invalidate_range_end, before checking the mmu seq.
>>>>
>>>> Signed-off-by: Barret Rhoden <[email protected]>
>>>
>>> I don’t think the right place to change for this functionality is transparent_hugepage_adjust()
>>> which is meant to handle PFNs that are mapped as part of a transparent huge-page.
>>>
>>> For example, this would prevent mapping DAX-backed file page as 1GB.
>>> As transparent_hugepage_adjust() only handles the case (level == PT_PAGE_TABLE_LEVEL).
>>>
>>> As you are parsing the page-tables to discover the page-size the PFN is mapped in,
>>> I think you should instead modify kvm_host_page_size() to parse page-tables instead
>>> of rely on vma_kernel_pagesize() (Which relies on vma->vm_ops->pagesize()) in case
>>> of is_zone_device_page().
>>> The main complication though of doing this is that at this point you don’t yet have the PFN
>>> that is retrieved by try_async_pf(). So maybe you should consider modifying the order of calls
>>> in tdp_page_fault() & FNAME(page_fault)().
>>>
>>> -Liran
>> Or alternatively when thinking about it more, maybe just rename transparent_hugepage_adjust()
>> to not be specific to THP and better handle the case of parsing page-tables changing mapping-level to 1GB.
>> That is probably easier and more elegant.
>
> I can rename it to hugepage_adjust(), since it's not just THP anymore.
Sounds good.
>
> I was a little hesitant to change the this to handle 1 GB pages with this patchset at first. I didn't want to break the non-DAX case stuff by doing so.
Why would it affect the non-DAX case?
Your patch should just make hugepage_adjust() parse page-tables only in the is_zone_device_page() case. Otherwise, page tables shouldn’t be parsed.
i.e. THP-merged pages should still be detected by PageTransCompoundMap().
>
> Specifically, can a THP page be 1 GB, and if so, how can you tell? If you can't tell easily, I could walk the page table for all cases, instead of just zone_device().
I prefer to walk page-tables only for is_zone_device_page().
>
> I'd also have to drop the "level == PT_PAGE_TABLE_LEVEL" check, I think, which would open this up to hugetlbfs pages (based on the comments). Is there any reason why that would be a bad idea?
KVM already supports mapping 1GB hugetlbfs pages: level is set to PUD-level by tdp_page_fault()->mapping_level()->host_mapping_level()->kvm_host_page_size()->vma_kernel_pagesize(), because a VMA that is an mmap of hugetlbfs sets vma->vm_ops to hugetlb_vm_ops, whose hugetlb_vm_op_pagesize() returns the appropriate page-size.
Specifically, I don’t think THP ever merges small pages into 1GB pages. I think this is why transparent_hugepage_adjust() checks PageTransCompoundMap() only when level == PT_PAGE_TABLE_LEVEL. I think you should keep this check in the !is_zone_device_page() case.
-Liran
>
> Thanks,
>
> Barret
>
On 12/12/19 8:07 PM, Liran Alon wrote:
>> I was a little hesitant to change the this to handle 1 GB pages with this patchset at first. I didn't want to break the non-DAX case stuff by doing so.
>
> Why would it affect non-DAX case?
> Your patch should just make hugepage_adjust() to parse page-tables only in case is_zone_device_page(). Otherwise, page tables shouldn’t be parsed.
> i.e. THP merged pages should still be detected by PageTransCompoundMap().
That's what I already do. But if I wanted to make the hugepage_adjust()
function also handle the change to 1 GB, then that code would apply to
THP too. I didn't want to do that without knowing the implications for THP.
>> Specifically, can a THP page be 1 GB, and if so, how can you tell? If you can't tell easily, I could walk the page table for all cases, instead of just zone_device().
>
> I prefer to walk page-tables only for is_zone_device_page().
Is there another way to tell if a THP page is 1 GB? Anyway, this is the
sort of stuff I didn't want to mess around with.
hugepage_adjust() seemed like a reasonable place to get a huge (2MB)
page table entry out of a DAX mapping. I didn't want to proliferate
another special case for upgrading to a larger PTE size (i.e. how
hugetlbfs and THP have separate mechanisms), so I hopped onto the "can
we do a 2MB mapping even though host_mapping_level() didn't say so" case
- which is my interpretation of what hugepage_adjust() is for.
Barret
On Fri, Dec 13, 2019 at 03:07:31AM +0200, Liran Alon wrote:
>
> > On 12 Dec 2019, at 21:55, Barret Rhoden <[email protected]> wrote:
> >
> >>>> Note that KVM already faulted in the page (or huge page) in the host's
> >>>> page table, and we hold the KVM mmu spinlock. We grabbed that lock in
> >>>> kvm_mmu_notifier_invalidate_range_end, before checking the mmu seq.
> >>>>
> >>>> Signed-off-by: Barret Rhoden <[email protected]>
> >>>
> >>> I don’t think the right place to change for this functionality is
> >>> transparent_hugepage_adjust() which is meant to handle PFNs that are
> >>> mapped as part of a transparent huge-page.
> >>>
> >>> For example, this would prevent mapping DAX-backed file page as 1GB. As
> >>> transparent_hugepage_adjust() only handles the case (level ==
> >>> PT_PAGE_TABLE_LEVEL).
Teaching thp_adjust() how to handle 1GB wouldn't be a bad thing. It's
unlikely THP itself will support 1GB pages any time soon, but having the
logic there wouldn't hurt anything.
> >>> As you are parsing the page-tables to discover the page-size the PFN is
> >>> mapped in, I think you should instead modify kvm_host_page_size() to
> >>> parse page-tables instead of rely on vma_kernel_pagesize() (Which relies
> >>> on vma->vm_ops->pagesize()) in case of is_zone_device_page().
> >>>
> >>> The main complication though of doing this is that at this point you
> >>> don’t yet have the PFN that is retrieved by try_async_pf(). So maybe you
> >>> should consider modifying the order of calls in tdp_page_fault() &
> >>> FNAME(page_fault)().
> >>>
> >>> -Liran
> >> Or alternatively when thinking about it more, maybe just rename
> >> transparent_hugepage_adjust() to not be specific to THP and better handle
> >> the case of parsing page-tables changing mapping-level to 1GB.
> >> That is probably easier and more elegant.
Agreed.
> > I can rename it to hugepage_adjust(), since it's not just THP anymore.
Or maybe allowed_hugepage_adjust()? To pair with disallowed_hugepage_adjust(),
which adjusts KVM's page size in the opposite direction to avoid the iTLB
multi-hit issue.
>
> Sounds good.
>
> >
> > I was a little hesitant to change the this to handle 1 GB pages with this
> > patchset at first. I didn't want to break the non-DAX case stuff by doing
> > so.
>
> Why would it affect non-DAX case?
> Your patch should just make hugepage_adjust() to parse page-tables only in case is_zone_device_page(). Otherwise, page tables shouldn’t be parsed.
> i.e. THP merged pages should still be detected by PageTransCompoundMap().
I think what Barret is saying is that teaching thp_adjust() how to do 1gb
mappings would naturally affect the code path for THP pages. But I agree
that it would be superficial.
> > Specifically, can a THP page be 1 GB, and if so, how can you tell? If you
> > can't tell easily, I could walk the page table for all cases, instead of
> > just zone_device().
No, THP doesn't currently support 1gb pages. Explicitly returning
PMD_SIZE on PageTransCompoundMap() would be a good thing from a readability
perspective.
> I prefer to walk page-tables only for is_zone_device_page().
>
> >
> > I'd also have to drop the "level == PT_PAGE_TABLE_LEVEL" check, I think,
> > which would open this up to hugetlbfs pages (based on the comments). Is
> > there any reason why that would be a bad idea?
No, the "level == PT_PAGE_TABLE_LEVEL" check is to filter out the case
where KVM is already planning on using a large page, e.g. when the memory
is backed by hugetlbs.
> KVM already supports mapping 1GB hugetlbfs pages. As level is set to
> PUD-level by
> tdp_page_fault()->mapping_level()->host_mapping_level()->kvm_host_page_size()->vma_kernel_pagesize().
> As VMA which is mmap of hugetlbfs sets vma->vm_ops to hugetlb_vm_ops() where
> hugetlb_vm_op_pagesize() will return appropriate page-size.
>
> Specifically, I don’t think THP ever merges small pages to 1GB pages. I think
> this is why transparent_hugepage_adjust() checks PageTransCompoundMap() only
> in case level == PT_PAGE_TABLE_LEVEL. I think you should keep this check in
> the case of !is_zone_device_page().
I would add 1gb support for DAX as a third patch in this series. To pave
the way in patch 2/2, change it to replace "bool pfn_is_huge_mapped()" with
"int host_pfn_mapping_level()", and maybe also renaming host_mapping_level()
to host_vma_mapping_level() to avoid confusion.
Then allowed_hugepage_adjust() would look something like:
static void allowed_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
kvm_pfn_t *pfnp, int *levelp, int max_level)
{
kvm_pfn_t pfn = *pfnp;
int level = *levelp;
unsigned long mask;
if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn) ||
level == PT_PAGE_TABLE_LEVEL)
return;
/*
* mmu_notifier_retry() was successful and mmu_lock is held, so
* the pmd/pud can't be split from under us.
*/
level = host_pfn_mapping_level(vcpu->kvm, gfn, pfn);
*levelp = level = min(level, max_level);
mask = KVM_PAGES_PER_HPAGE(level) - 1;
VM_BUG_ON((gfn & mask) != (pfn & mask));
*pfnp = pfn & ~mask;
}
> On 13 Dec 2019, at 19:19, Sean Christopherson <[email protected]> wrote:
>
> On Fri, Dec 13, 2019 at 03:07:31AM +0200, Liran Alon wrote:
>>
>>> On 12 Dec 2019, at 21:55, Barret Rhoden <[email protected]> wrote:
>>>
>>>>>> Note that KVM already faulted in the page (or huge page) in the host's
>>>>>> page table, and we hold the KVM mmu spinlock. We grabbed that lock in
>>>>>> kvm_mmu_notifier_invalidate_range_end, before checking the mmu seq.
>>>>>>
>>>>>> Signed-off-by: Barret Rhoden <[email protected]>
>>>>>
>>>>> I don’t think the right place to change for this functionality is
>>>>> transparent_hugepage_adjust() which is meant to handle PFNs that are
>>>>> mapped as part of a transparent huge-page.
>>>>>
>>>>> For example, this would prevent mapping DAX-backed file page as 1GB. As
>>>>> transparent_hugepage_adjust() only handles the case (level ==
>>>>> PT_PAGE_TABLE_LEVEL).
>
> Teaching thp_adjust() how to handle 1GB wouldn't be a bad thing. It's
> unlikely THP itself will support 1GB pages any time soon, but having the
> logic there wouldn't hurt anything.
I agree.
>
>>>>> As you are parsing the page-tables to discover the page-size the PFN is
>>>>> mapped in, I think you should instead modify kvm_host_page_size() to
>>>>> parse page-tables instead of rely on vma_kernel_pagesize() (Which relies
>>>>> on vma->vm_ops->pagesize()) in case of is_zone_device_page().
>>>>>
>>>>> The main complication though of doing this is that at this point you
>>>>> don’t yet have the PFN that is retrieved by try_async_pf(). So maybe you
>>>>> should consider modifying the order of calls in tdp_page_fault() &
>>>>> FNAME(page_fault)().
>>>>>
>>>>> -Liran
>>>> Or alternatively when thinking about it more, maybe just rename
>>>> transparent_hugepage_adjust() to not be specific to THP and better handle
>>>> the case of parsing page-tables changing mapping-level to 1GB.
>>>> That is probably easier and more elegant.
>
> Agreed.
>
>>> I can rename it to hugepage_adjust(), since it's not just THP anymore.
>
> Or maybe allowed_hugepage_adjust()? To pair with disallowed_hugepage_adjust(),
> which adjusts KVM's page size in the opposite direction to avoid the iTLB
> multi-hit issue.
>
>>
>> Sounds good.
>>
>>>
>>> I was a little hesitant to change the this to handle 1 GB pages with this
>>> patchset at first. I didn't want to break the non-DAX case stuff by doing
>>> so.
>>
>> Why would it affect non-DAX case?
>> Your patch should just make hugepage_adjust() to parse page-tables only in case is_zone_device_page(). Otherwise, page tables shouldn’t be parsed.
>> i.e. THP merged pages should still be detected by PageTransCompoundMap().
>
> I think what Barret is saying is that teaching thp_adjust() how to do 1gb
> mappings would naturally affect the code path for THP pages. But I agree
> that it would be superficial.
>
>>> Specifically, can a THP page be 1 GB, and if so, how can you tell? If you
>>> can't tell easily, I could walk the page table for all cases, instead of
>>> just zone_device().
>
> No, THP doesn't currently support 1gb pages. Expliciting returning
> PMD_SIZE on PageTransCompoundMap() would be a good thing from a readability
> perspective.
Right.
>
>> I prefer to walk page-tables only for is_zone_device_page().
>>
>>>
>>> I'd also have to drop the "level == PT_PAGE_TABLE_LEVEL" check, I think,
>>> which would open this up to hugetlbfs pages (based on the comments). Is
>>> there any reason why that would be a bad idea?
>
> No, the "level == PT_PAGE_TABLE_LEVEL" check is to filter out the case
> where KVM is already planning on using a large page, e.g. when the memory
> is backed by hugetlbs.
Right.
>
>> KVM already supports mapping 1GB hugetlbfs pages. As level is set to
>> PUD-level by
>> tdp_page_fault()->mapping_level()->host_mapping_level()->kvm_host_page_size()->vma_kernel_pagesize().
>> As VMA which is mmap of hugetlbfs sets vma->vm_ops to hugetlb_vm_ops() where
>> hugetlb_vm_op_pagesize() will return appropriate page-size.
>>
>> Specifically, I don’t think THP ever merges small pages to 1GB pages. I think
>> this is why transparent_hugepage_adjust() checks PageTransCompoundMap() only
>> in case level == PT_PAGE_TABLE_LEVEL. I think you should keep this check in
>> the case of !is_zone_device_page().
>
> I would add 1gb support for DAX as a third patch in this series. To pave
> the way in patch 2/2, change it to replace "bool pfn_is_huge_mapped()" with
> "int host_pfn_mapping_level()", and maybe also renaming host_mapping_level()
> to host_vma_mapping_level() to avoid confusion.
I agree.
So also rename kvm_host_page_size() to kvm_host_vma_page_size() :)
>
> Then allowed_hugepage_adjust() would look something like:
>
> static void allowed_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
> kvm_pfn_t *pfnp, int *levelp, int max_level)
> {
> kvm_pfn_t pfn = *pfnp;
> int level = *levelp;
> unsigned long mask;
>
> if (is_error_noslot_pfn(pfn) || !kvm_is_reserved_pfn(pfn) ||
> level == PT_PAGE_TABLE_LEVEL)
> return;
>
> /*
> * mmu_notifier_retry() was successful and mmu_lock is held, so
> * the pmd/pud can't be split from under us.
> */
> level = host_pfn_mapping_level(vcpu->kvm, gfn, pfn);
>
> *levelp = level = min(level, max_level);
> mask = KVM_PAGES_PER_HPAGE(level) - 1;
> VM_BUG_ON((gfn & mask) != (pfn & mask));
> *pfnp = pfn & ~mask;
Why don’t you still need to kvm_release_pfn_clean() the original pfn and kvm_get_pfn() the new huge-page start pfn?
> }
Yep. This is similar to what I had in mind.
Then just put the logic of parsing page-tables when it’s is_zone_device_page(), or returning PMD_SIZE when it’s PageTransCompoundMap(), inside host_pfn_mapping_level(). This makes the code very straightforward.
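To make that concrete, a hypothetical sketch of host_pfn_mapping_level() (the helper does not exist in this series; the PT_*_LEVEL constants are KVM's existing level definitions and dev_pagemap_mapping_shift() is the helper from patch 1):

static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn)
{
	struct page *page = pfn_to_page(pfn);
	unsigned long hva, shift;

	if (!is_zone_device_page(page))
		return PageTransCompoundMap(page) ? PT_DIRECTORY_LEVEL :
						    PT_PAGE_TABLE_LEVEL;

	hva = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(hva))
		return PT_PAGE_TABLE_LEVEL;

	/* The caller holds mmu_lock after a successful mmu_notifier_retry(). */
	shift = dev_pagemap_mapping_shift(hva, current->mm);
	if (shift >= PUD_SHIFT)
		return PT_PDPE_LEVEL;
	if (shift >= PMD_SHIFT)
		return PT_DIRECTORY_LEVEL;
	return PT_PAGE_TABLE_LEVEL;
}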
-Liran
On Thu, Dec 12, 2019 at 01:22:37PM -0500, Barret Rhoden wrote:
> KVM has a use case for determining the size of a dax mapping.
>
> The KVM code has easy access to the address and the mm, and
> dev_pagemap_mapping_shift() needs only those parameters. It was
> deriving them from page and vma. This commit changes those parameters
> from (page, vma) to (address, mm).
>
> Signed-off-by: Barret Rhoden <[email protected]>
> Reviewed-by: David Hildenbrand <[email protected]>
> Acked-by: Dan Williams <[email protected]>
> ---
> include/linux/mm.h | 3 +++
> mm/memory-failure.c | 38 +++-----------------------------------
> mm/util.c | 34 ++++++++++++++++++++++++++++++++++
> 3 files changed, 40 insertions(+), 35 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index a2adf95b3f9c..bfd1882dd5c6 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1013,6 +1013,9 @@ static inline bool is_pci_p2pdma_page(const struct page *page)
> #define page_ref_zero_or_close_to_overflow(page) \
> ((unsigned int) page_ref_count(page) + 127u <= 127u)
>
> +unsigned long dev_pagemap_mapping_shift(unsigned long address,
> + struct mm_struct *mm);
> +
> static inline void get_page(struct page *page)
> {
> page = compound_head(page);
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 3151c87dff73..bafa464c8290 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -261,40 +261,6 @@ void shake_page(struct page *p, int access)
> }
> EXPORT_SYMBOL_GPL(shake_page);
>
> -static unsigned long dev_pagemap_mapping_shift(struct page *page,
> - struct vm_area_struct *vma)
> -{
> - unsigned long address = vma_address(page, vma);
> - pgd_t *pgd;
> - p4d_t *p4d;
> - pud_t *pud;
> - pmd_t *pmd;
> - pte_t *pte;
> -
> - pgd = pgd_offset(vma->vm_mm, address);
> - if (!pgd_present(*pgd))
> - return 0;
> - p4d = p4d_offset(pgd, address);
> - if (!p4d_present(*p4d))
> - return 0;
> - pud = pud_offset(p4d, address);
> - if (!pud_present(*pud))
> - return 0;
> - if (pud_devmap(*pud))
> - return PUD_SHIFT;
> - pmd = pmd_offset(pud, address);
> - if (!pmd_present(*pmd))
> - return 0;
> - if (pmd_devmap(*pmd))
> - return PMD_SHIFT;
> - pte = pte_offset_map(pmd, address);
> - if (!pte_present(*pte))
> - return 0;
> - if (pte_devmap(*pte))
> - return PAGE_SHIFT;
> - return 0;
> -}
> -
> /*
> * Failure handling: if we can't find or can't kill a process there's
> * not much we can do. We just print a message and ignore otherwise.
> @@ -324,7 +290,9 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
> }
> tk->addr = page_address_in_vma(p, vma);
> if (is_zone_device_page(p))
> - tk->size_shift = dev_pagemap_mapping_shift(p, vma);
> + tk->size_shift =
> + dev_pagemap_mapping_shift(vma_address(page, vma),
> + vma->vm_mm);
> else
> tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;
>
> diff --git a/mm/util.c b/mm/util.c
> index 3ad6db9a722e..59984e6b40ab 100644
> --- a/mm/util.c
> +++ b/mm/util.c
> @@ -901,3 +901,37 @@ int memcmp_pages(struct page *page1, struct page *page2)
> kunmap_atomic(addr1);
> return ret;
> }
> +
> +unsigned long dev_pagemap_mapping_shift(unsigned long address,
> + struct mm_struct *mm)
> +{
> + pgd_t *pgd;
> + p4d_t *p4d;
> + pud_t *pud;
> + pmd_t *pmd;
> + pte_t *pte;
> +
> + pgd = pgd_offset(mm, address);
> + if (!pgd_present(*pgd))
> + return 0;
> + p4d = p4d_offset(pgd, address);
> + if (!p4d_present(*p4d))
> + return 0;
> + pud = pud_offset(p4d, address);
> + if (!pud_present(*pud))
> + return 0;
> + if (pud_devmap(*pud))
> + return PUD_SHIFT;
> + pmd = pmd_offset(pud, address);
> + if (!pmd_present(*pmd))
> + return 0;
> + if (pmd_devmap(*pmd))
> + return PMD_SHIFT;
> + pte = pte_offset_map(pmd, address);
> + if (!pte_present(*pte))
> + return 0;
> + if (pte_devmap(*pte))
> + return PAGE_SHIFT;
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(dev_pagemap_mapping_shift);
This is basically a rehash of lookup_address_in_pgd(), and doesn't provide
exactly what KVM needs. E.g. KVM works with levels instead of shifts, and
it would be nice to provide the pte so that KVM can sanity check that the
pfn from this walk matches the pfn it plans on mapping.
Instead of exporting dev_pagemap_mapping_shift(), what about replacing it
with a patch to introduce lookup_address_mm() and export that?
dev_pagemap_mapping_shift() could then wrap the new helper (if you want),
and KVM could do lookup_address_mm() for querying the size of ZONE_DEVICE
pages.
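For reference, a minimal sketch of such a helper (hypothetical; today only lookup_address() and lookup_address_in_pgd() exist, and both are x86-specific, unlike the generic dev_pagemap_mapping_shift() added in mm/util.c):

pte_t *lookup_address_mm(struct mm_struct *mm, unsigned long address,
			 unsigned int *level)
{
	/* Walk the given mm's page tables instead of init_mm's. */
	return lookup_address_in_pgd(pgd_offset(mm, address), address, level);
}
EXPORT_SYMBOL_GPL(lookup_address_mm);

KVM could then translate the returned PG_LEVEL_* value into its own level constants and sanity check the returned pte against the pfn it intends to map.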
On Fri, Dec 13, 2019 at 07:31:55PM +0200, Liran Alon wrote:
>
> > On 13 Dec 2019, at 19:19, Sean Christopherson <[email protected]> wrote:
> >
> > Then allowed_hugepage_adjust() would look something like:
> >
> > static void allowed_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
> > kvm_pfn_t *pfnp, int *levelp, int max_level)
> > {
> > kvm_pfn_t pfn = *pfnp;
> > int level = *levelp;
> > unsigned long mask;
> >
> > if (is_error_noslot_pfn(pfn) || !kvm_is_reserved_pfn(pfn) ||
> > level == PT_PAGE_TABLE_LEVEL)
> > return;
> >
> > /*
> > * mmu_notifier_retry() was successful and mmu_lock is held, so
> > * the pmd/pud can't be split from under us.
> > */
> > level = host_pfn_mapping_level(vcpu->kvm, gfn, pfn);
> >
> > *levelp = level = min(level, max_level);
> > mask = KVM_PAGES_PER_HPAGE(level) - 1;
> > VM_BUG_ON((gfn & mask) != (pfn & mask));
> > *pfnp = pfn & ~mask;
>
> Why don’t you still need to kvm_release_pfn_clean() for original pfn and
> kvm_get_pfn() for new huge-page start pfn?
That code is gone in kvm/queue. thp_adjust() is now called from
__direct_map() and FNAME(fetch), and so its pfn adjustment doesn't bleed
back to the page fault handlers. The only reason the put/get pfn code
existed was because the page fault handlers called kvm_release_pfn_clean()
on the pfn, i.e. they would have put the wrong pfn.
> On 13 Dec 2019, at 19:50, Sean Christopherson <[email protected]> wrote:
>
> On Fri, Dec 13, 2019 at 07:31:55PM +0200, Liran Alon wrote:
>>
>>> On 13 Dec 2019, at 19:19, Sean Christopherson <[email protected]> wrote:
>>>
>>> Then allowed_hugepage_adjust() would look something like:
>>>
>>> static void allowed_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
>>> kvm_pfn_t *pfnp, int *levelp, int max_level)
>>> {
>>> kvm_pfn_t pfn = *pfnp;
>>> int level = *levelp;
>>> unsigned long mask;
>>>
>>> if (is_error_noslot_pfn(pfn) || !kvm_is_reserved_pfn(pfn) ||
>>> level == PT_PAGE_TABLE_LEVEL)
>>> return;
>>>
>>> /*
>>> * mmu_notifier_retry() was successful and mmu_lock is held, so
>>> * the pmd/pud can't be split from under us.
>>> */
>>> level = host_pfn_mapping_level(vcpu->kvm, gfn, pfn);
>>>
>>> *levelp = level = min(level, max_level);
>>> mask = KVM_PAGES_PER_HPAGE(level) - 1;
>>> VM_BUG_ON((gfn & mask) != (pfn & mask));
>>> *pfnp = pfn & ~mask;
>>
>> Why don’t you still need to kvm_release_pfn_clean() for original pfn and
>> kvm_get_pfn() for new huge-page start pfn?
>
> That code is gone in kvm/queue. thp_adjust() is now called from
> __direct_map() and FNAME(fetch), and so its pfn adjustment doesn't bleed
> back to the page fault handlers. The only reason the put/get pfn code
> existed was because the page fault handlers called kvm_release_pfn_clean()
> on the pfn, i.e. they would have put the wrong pfn.
Ack. Thanks for explaining this.
On Fri, Dec 13, 2019 at 9:47 AM Sean Christopherson
<[email protected]> wrote:
>
> On Thu, Dec 12, 2019 at 01:22:37PM -0500, Barret Rhoden wrote:
> > KVM has a use case for determining the size of a dax mapping.
> >
> > The KVM code has easy access to the address and the mm, and
> > dev_pagemap_mapping_shift() needs only those parameters. It was
> > deriving them from page and vma. This commit changes those parameters
> > from (page, vma) to (address, mm).
> >
> > Signed-off-by: Barret Rhoden <[email protected]>
> > Reviewed-by: David Hildenbrand <[email protected]>
> > Acked-by: Dan Williams <[email protected]>
> > ---
> > include/linux/mm.h | 3 +++
> > mm/memory-failure.c | 38 +++-----------------------------------
> > mm/util.c | 34 ++++++++++++++++++++++++++++++++++
> > 3 files changed, 40 insertions(+), 35 deletions(-)
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index a2adf95b3f9c..bfd1882dd5c6 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -1013,6 +1013,9 @@ static inline bool is_pci_p2pdma_page(const struct page *page)
> > #define page_ref_zero_or_close_to_overflow(page) \
> > ((unsigned int) page_ref_count(page) + 127u <= 127u)
> >
> > +unsigned long dev_pagemap_mapping_shift(unsigned long address,
> > + struct mm_struct *mm);
> > +
> > static inline void get_page(struct page *page)
> > {
> > page = compound_head(page);
> > diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> > index 3151c87dff73..bafa464c8290 100644
> > --- a/mm/memory-failure.c
> > +++ b/mm/memory-failure.c
> > @@ -261,40 +261,6 @@ void shake_page(struct page *p, int access)
> > }
> > EXPORT_SYMBOL_GPL(shake_page);
> >
> > -static unsigned long dev_pagemap_mapping_shift(struct page *page,
> > - struct vm_area_struct *vma)
> > -{
> > - unsigned long address = vma_address(page, vma);
> > - pgd_t *pgd;
> > - p4d_t *p4d;
> > - pud_t *pud;
> > - pmd_t *pmd;
> > - pte_t *pte;
> > -
> > - pgd = pgd_offset(vma->vm_mm, address);
> > - if (!pgd_present(*pgd))
> > - return 0;
> > - p4d = p4d_offset(pgd, address);
> > - if (!p4d_present(*p4d))
> > - return 0;
> > - pud = pud_offset(p4d, address);
> > - if (!pud_present(*pud))
> > - return 0;
> > - if (pud_devmap(*pud))
> > - return PUD_SHIFT;
> > - pmd = pmd_offset(pud, address);
> > - if (!pmd_present(*pmd))
> > - return 0;
> > - if (pmd_devmap(*pmd))
> > - return PMD_SHIFT;
> > - pte = pte_offset_map(pmd, address);
> > - if (!pte_present(*pte))
> > - return 0;
> > - if (pte_devmap(*pte))
> > - return PAGE_SHIFT;
> > - return 0;
> > -}
> > -
> > /*
> > * Failure handling: if we can't find or can't kill a process there's
> > * not much we can do. We just print a message and ignore otherwise.
> > @@ -324,7 +290,9 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
> > }
> > tk->addr = page_address_in_vma(p, vma);
> > if (is_zone_device_page(p))
> > - tk->size_shift = dev_pagemap_mapping_shift(p, vma);
> > + tk->size_shift =
> > + dev_pagemap_mapping_shift(vma_address(page, vma),
> > + vma->vm_mm);
> > else
> > tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;
> >
> > diff --git a/mm/util.c b/mm/util.c
> > index 3ad6db9a722e..59984e6b40ab 100644
> > --- a/mm/util.c
> > +++ b/mm/util.c
> > @@ -901,3 +901,37 @@ int memcmp_pages(struct page *page1, struct page *page2)
> > kunmap_atomic(addr1);
> > return ret;
> > }
> > +
> > +unsigned long dev_pagemap_mapping_shift(unsigned long address,
> > + struct mm_struct *mm)
> > +{
> > + pgd_t *pgd;
> > + p4d_t *p4d;
> > + pud_t *pud;
> > + pmd_t *pmd;
> > + pte_t *pte;
> > +
> > + pgd = pgd_offset(mm, address);
> > + if (!pgd_present(*pgd))
> > + return 0;
> > + p4d = p4d_offset(pgd, address);
> > + if (!p4d_present(*p4d))
> > + return 0;
> > + pud = pud_offset(p4d, address);
> > + if (!pud_present(*pud))
> > + return 0;
> > + if (pud_devmap(*pud))
> > + return PUD_SHIFT;
> > + pmd = pmd_offset(pud, address);
> > + if (!pmd_present(*pmd))
> > + return 0;
> > + if (pmd_devmap(*pmd))
> > + return PMD_SHIFT;
> > + pte = pte_offset_map(pmd, address);
> > + if (!pte_present(*pte))
> > + return 0;
> > + if (pte_devmap(*pte))
> > + return PAGE_SHIFT;
> > + return 0;
> > +}
> > +EXPORT_SYMBOL_GPL(dev_pagemap_mapping_shift);
>
> This is basically a rehash of lookup_address_in_pgd(), and doesn't provide
> exactly what KVM needs. E.g. KVM works with levels instead of shifts, and
> it would be nice to provide the pte so that KVM can sanity check that the
> pfn from this walk matches the pfn it plans on mapping.
>
> Instead of exporting dev_pagemap_mapping_shift(), what about relacing it
> with a patch to introduce lookup_address_mm() and export that?
>
> dev_pagemap_mapping_shift() could then wrap the new helper (if you want),
> and KVM could do lookup_address_mm() for querying the size of ZONE_DEVICE
> pages.
All of the above sounds great to me. Should have looked that much
harder when implementing dev_pagemap_mapping_shift() originally.
On Mon, Dec 16, 2019 at 11:05:26AM -0500, Barret Rhoden wrote:
> On 12/13/19 12:19 PM, Sean Christopherson wrote:
> >Teaching thp_adjust() how to handle 1GB wouldn't be a bad thing. It's
> >unlikely THP itself will support 1GB pages any time soon, but having the
> >logic there wouldn't hurt anything.
> >
>
> Cool. This was my main concern - I didn't want to break THP.
>
> I'll rework the series based on all of your comments.
Hopefully you haven't put too much effort into the rework, because I want
to commandeer the proposed changes and use them as the basis for a more
aggressive overhaul of KVM's hugepage handling. Ironically, there's a bug
in KVM's THP handling that I _think_ can be avoided by using the DAX
approach of walking the host PTEs.
I'm in the process of testing, hopefully I'll get a series sent out later
today. If not, I should at least be able to provide an update.
Hi -
On 1/7/20 2:05 PM, Sean Christopherson wrote:
> On Mon, Dec 16, 2019 at 11:05:26AM -0500, Barret Rhoden wrote:
>> On 12/13/19 12:19 PM, Sean Christopherson wrote:
>>> Teaching thp_adjust() how to handle 1GB wouldn't be a bad thing. It's
>>> unlikely THP itself will support 1GB pages any time soon, but having the
>>> logic there wouldn't hurt anything.
>>>
>>
>> Cool. This was my main concern - I didn't want to break THP.
>>
>> I'll rework the series based on all of your comments.
>
> Hopefully you haven't put too much effort into the rework, because I want
> to commandeer the proposed changes and use them as the basis for a more
> aggressive overhaul of KVM's hugepage handling. Ironically, there's a bug
> in KVM's THP handling that I _think_ can be avoided by using the DAX
> approach of walking the host PTEs.
>
> I'm in the process of testing, hopefully I'll get a series sent out later
> today. If not, I should at least be able to provide an update.
Nice timing. I was just about to get back to this, so I haven't put any
time in yet. =)
Please CC me, and I'll try your patches out on my end.
Thanks,
Barret
On Tue, Jan 07, 2020 at 02:19:06PM -0500, Barret Rhoden wrote:
> On 1/7/20 2:05 PM, Sean Christopherson wrote:
> >Hopefully you haven't put too much effort into the rework, because I want
> >to commandeer the proposed changes and use them as the basis for a more
> >aggressive overhaul of KVM's hugepage handling. Ironically, there's a bug
> >in KVM's THP handling that I _think_ can be avoided by using the DAX
> >approach of walking the host PTEs.
> >
> >I'm in the process of testing, hopefully I'll get a series sent out later
> >today. If not, I should at least be able to provide an update.
>
> Nice timing. I was just about to get back to this, so I haven't put any
> time in yet. =)
>
> Please CC me, and I'll try your patches out on my end.
Will do. Barring last minute hiccups, the code is ready, just need to
finish off a few changelogs. Should get it out early tomorrow.
One question that may help avoid some churn: are huge DAX pages not
tracked as compound pages? The comment from your/this patch is pretty
unequivocal, but I wanted to double check that they will really return
false for PageCompound(), as opposed to only returning false for
PageTransCompoundMap().
/*
* DAX pages do not use compound pages. ...
*/
Thanks!
On Tue, Jan 7, 2020 at 5:20 PM Sean Christopherson
<[email protected]> wrote:
>
> On Tue, Jan 07, 2020 at 02:19:06PM -0500, Barret Rhoden wrote:
> > On 1/7/20 2:05 PM, Sean Christopherson wrote:
> > >Hopefully you haven't put too much effort into the rework, because I want
> > >to commandeer the proposed changes and use them as the basis for a more
> > >aggressive overhaul of KVM's hugepage handling. Ironically, there's a bug
> > >in KVM's THP handling that I _think_ can be avoided by using the DAX
> > >approach of walking the host PTEs.
> > >
> > >I'm in the process of testing, hopefully I'll get a series sent out later
> > >today. If not, I should at least be able to provide an update.
> >
> > Nice timing. I was just about to get back to this, so I haven't put any
> > time in yet. =)
> >
> > Please CC me, and I'll try your patches out on my end.
>
> Will do. Barring last minute hiccups, the code is ready, just need to
> finish off a few changelogs. Should get it out early tomorrow.
>
> One question that may help avoid some churn: are huge DAX pages not
> tracked as compound pages? The comment from your/this patch is pretty
> unequivocal, but I wanted to double check that they will really return
> false for PageCompound(), as opposed to only returning false for
> PageTransCompoundMap().
PageCompound() returns false.
>
> /*
> * DAX pages do not use compound pages. ...
> */
>
None of the head / tail page infrastructure is set up for dax pages.
They are just independent 'struct page' objects that are
opportunistically mapped by different pte sizes in the dax core.
On 16/12/19 18:59, Barret Rhoden wrote:
> Does KVM-x86 need its own names for the levels? If not, I could convert
> the PT_PAGE_TABLE_* stuff to PG_LEVEL_* stuff.
Yes, please do. For the 2M/4M case, it's only incorrect to use 2M here:
if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
gfn += pse36_gfn_delta(pte);
And for that you can even use "> PG_LEVEL_4K" if you think it's nicer.
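For illustration, with that conversion the check could read (hypothetical, assuming KVM's PT_*_LEVEL names are replaced by the x86 PG_LEVEL_* ones):

if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36())
gfn += pse36_gfn_delta(pte);

which avoids labelling the 32-bit non-PAE 4M page as 2M.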
Paolo