Unlike many architectures, the powerpc 8xx hardware tablewalk requires
a two-level process for all page sizes, although the second level
only has one entry when the page size is 8M.

To fit the Linux page table topology without requiring a special
page directory layout like hugepd, the page entry will be replicated
1024 times in the standard page table. However, for large pages it is
necessary to set bits in the level-1 (PMD) entry. For the time being,
for 512k pages the flag is kept in the PTE and inserted into the PMD
entry at TLB miss exception. That is necessary because pages of
different sizes can coexist in a page table. However, the 12 PTE bits
are fully used and there is no room for an additional page size bit.

For 8M pages there will be only one page per PMD entry, so it is
possible to flag the page size in the PMD entry itself, with the
advantage that the information will already be in the right place for
the hardware.
To do so, add a new helper called pmd_populate_size() which takes the
page size as an additional argument, and modify __pte_alloc() to also
take that argument. pte_alloc() is left unmodified in order to
reduce churn on callers, and a pte_alloc_size() is added for use by
pte_alloc_huge().
When an architecture doesn't provide pmd_populate_size(),
pmd_populate() is used as a fallback.
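
For illustration, an architecture wanting the size in the PMD could
provide an override along these lines (a hypothetical 8xx-flavoured
sketch, not part of this patch; the _PMD_* flag names are assumptions
based on the existing 8xx headers):

#define pmd_populate_size pmd_populate_size
static inline void pmd_populate_size(struct mm_struct *mm, pmd_t *pmdp,
				     pgtable_t pte_page, unsigned long sz)
{
	/* sz: size in bytes of the pages the new page table will map */
	unsigned long flags = (sz == SZ_8M) ? _PMD_PAGE_8M : 0;

	*pmdp = __pmd(__pa(pte_page) | _PMD_USER | _PMD_PRESENT | flags);
}

The expected user of pte_alloc_size() is then the hugetlb path, roughly
(again a sketch of the follow-up change, not this patch):

static inline pte_t *pte_alloc_huge(struct mm_struct *mm, pmd_t *pmd,
				    unsigned long addr, unsigned long sz)
{
	return pte_alloc_size(mm, pmd, sz) ? NULL : pte_offset_huge(pmd, addr);
}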
Signed-off-by: Christophe Leroy <[email protected]>
---
 include/linux/mm.h | 12 +++++++-----
 mm/filemap.c       |  2 +-
 mm/internal.h      |  2 +-
 mm/memory.c        | 19 ++++++++++++-------
 mm/pgalloc-track.h |  2 +-
 mm/userfaultfd.c   |  4 ++--
 6 files changed, 24 insertions(+), 17 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2c0910bc3e4a..6c5c15955d4e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2801,8 +2801,8 @@ static inline void mm_inc_nr_ptes(struct mm_struct *mm) {}
static inline void mm_dec_nr_ptes(struct mm_struct *mm) {}
#endif
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd);
-int __pte_alloc_kernel(pmd_t *pmd);
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long sz);
+int __pte_alloc_kernel(pmd_t *pmd, unsigned long sz);
#if defined(CONFIG_MMU)
@@ -2987,7 +2987,8 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
pte_unmap(pte); \
} while (0)
-#define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd))
+#define pte_alloc_size(mm, pmd, sz) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd, sz))
+#define pte_alloc(mm, pmd) pte_alloc_size(mm, pmd, PAGE_SIZE)
#define pte_alloc_map(mm, pmd, address) \
(pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address))
@@ -2996,9 +2997,10 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
(pte_alloc(mm, pmd) ? \
NULL : pte_offset_map_lock(mm, pmd, address, ptlp))
-#define pte_alloc_kernel(pmd, address) \
- ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \
+#define pte_alloc_kernel_size(pmd, address, sz) \
+ ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, sz))? \
NULL: pte_offset_kernel(pmd, address))
+#define pte_alloc_kernel(pmd, address) pte_alloc_kernel_size(pmd, address, PAGE_SIZE)
#if USE_SPLIT_PMD_PTLOCKS
diff --git a/mm/filemap.c b/mm/filemap.c
index 7437b2bd75c1..b013000ea84f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3428,7 +3428,7 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
}
if (pmd_none(*vmf->pmd) && vmf->prealloc_pte)
- pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);
+ pmd_install(mm, vmf->pmd, &vmf->prealloc_pte, PAGE_SIZE);
return false;
}
diff --git a/mm/internal.h b/mm/internal.h
index 7e486f2c502c..b81c3ca59f45 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -206,7 +206,7 @@ void folio_activate(struct folio *folio);
void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
struct vm_area_struct *start_vma, unsigned long floor,
unsigned long ceiling, bool mm_wr_locked);
-void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
+void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte, unsigned long sz);
struct zap_details;
void unmap_page_range(struct mmu_gather *tlb,
diff --git a/mm/memory.c b/mm/memory.c
index f2bc6dd15eb8..c846bb75746b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -409,7 +409,12 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
} while (vma);
}
-void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
+#ifndef pmd_populate_size
+#define pmd_populate_size(mm, pmdp, pte, sz) pmd_populate(mm, pmdp, pte)
+#define pmd_populate_kernel_size(mm, pmdp, pte, sz) pmd_populate_kernel(mm, pmdp, pte)
+#endif
+
+void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte, unsigned long sz)
{
spinlock_t *ptl = pmd_lock(mm, pmd);
@@ -429,25 +434,25 @@ void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
* smp_rmb() barriers in page table walking code.
*/
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
- pmd_populate(mm, pmd, *pte);
+ pmd_populate_size(mm, pmd, *pte, sz);
*pte = NULL;
}
spin_unlock(ptl);
}
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long sz)
{
pgtable_t new = pte_alloc_one(mm);
if (!new)
return -ENOMEM;
- pmd_install(mm, pmd, &new);
+ pmd_install(mm, pmd, &new, sz);
if (new)
pte_free(mm, new);
return 0;
}
-int __pte_alloc_kernel(pmd_t *pmd)
+int __pte_alloc_kernel(pmd_t *pmd, unsigned long sz)
{
pte_t *new = pte_alloc_one_kernel(&init_mm);
if (!new)
@@ -456,7 +461,7 @@ int __pte_alloc_kernel(pmd_t *pmd)
spin_lock(&init_mm.page_table_lock);
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
smp_wmb(); /* See comment in pmd_install() */
- pmd_populate_kernel(&init_mm, pmd, new);
+ pmd_populate_kernel_size(&init_mm, pmd, new, sz);
new = NULL;
}
spin_unlock(&init_mm.page_table_lock);
@@ -4738,7 +4743,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
}
if (vmf->prealloc_pte)
- pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
+ pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte, PAGE_SIZE);
else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
return VM_FAULT_OOM;
}
diff --git a/mm/pgalloc-track.h b/mm/pgalloc-track.h
index e9e879de8649..90e37de7ab77 100644
--- a/mm/pgalloc-track.h
+++ b/mm/pgalloc-track.h
@@ -45,7 +45,7 @@ static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud,
#define pte_alloc_kernel_track(pmd, address, mask) \
((unlikely(pmd_none(*(pmd))) && \
- (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\
+ (__pte_alloc_kernel(pmd, PAGE_SIZE) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\
NULL: pte_offset_kernel(pmd, address))
#endif /* _LINUX_PGALLOC_TRACK_H */
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 712160cd41ec..9baf507ce193 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -764,7 +764,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
break;
}
if (unlikely(pmd_none(dst_pmdval)) &&
- unlikely(__pte_alloc(dst_mm, dst_pmd))) {
+ unlikely(__pte_alloc(dst_mm, dst_pmd, PAGE_SIZE))) {
err = -ENOMEM;
break;
}
@@ -1686,7 +1686,7 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
err = -ENOENT;
break;
}
- if (unlikely(__pte_alloc(mm, src_pmd))) {
+ if (unlikely(__pte_alloc(mm, src_pmd, PAGE_SIZE))) {
err = -ENOMEM;
break;
}
--
2.43.0
On 25/03/2024 at 17:19, Jason Gunthorpe wrote:
> On Mon, Mar 25, 2024 at 03:55:54PM +0100, Christophe Leroy wrote:
>> Unlike many architectures, the powerpc 8xx hardware tablewalk requires
>> a two-level process for all page sizes, although the second level
>> only has one entry when the page size is 8M.
>>
>> To fit the Linux page table topology without requiring a special
>> page directory layout like hugepd, the page entry will be replicated
>> 1024 times in the standard page table. However, for large pages it is
>> necessary to set bits in the level-1 (PMD) entry. For the time being,
>> for 512k pages the flag is kept in the PTE and inserted into the PMD
>> entry at TLB miss exception. That is necessary because pages of
>> different sizes can coexist in a page table. However, the 12 PTE bits
>> are fully used and there is no room for an additional page size bit.
>>
>> For 8M pages there will be only one page per PMD entry, so it is
>> possible to flag the page size in the PMD entry itself, with the
>> advantage that the information will already be in the right place for
>> the hardware.
>>
>> To do so, add a new helper called pmd_populate_size() which takes the
>> page size as an additional argument, and modify __pte_alloc() to also
>> take that argument. pte_alloc() is left unmodified in order to
>> reduce churn on callers, and a pte_alloc_size() is added for use by
>> pte_alloc_huge().
>>
>> When an architecture doesn't provide pmd_populate_size(),
>> pmd_populate() is used as a fallback.
>
> I think it would be a good idea to document what the semantics of
> sz are supposed to be?
>
> Just a general remark, probably nothing for this, but with these new
> arguments the historical naming seems pretty tortured for
> pte_alloc_size().. Something like pmd_populate_leaf(size) as a naming
> scheme would make this more intuitive. Ie pmd_populate_leaf() gives
> you a PMD entry where the entry points to a leaf page table able to
> store folios of at least size.
>
> Anyhow, I thought the edits to the mm helpers were fine, certainly
> much nicer than hugepd. Do you see a path to remove hugepd entirely
> from here?
Not looked into details yet, but I guess so.
By the way, there is a wiki dedicated to huge pages on powerpc; you can
have a look at it here:
https://github.com/linuxppc/wiki/wiki/Huge-pages
Maybe you'll find good ideas there to help me.
Christophe
On Mon, Mar 25, 2024 at 03:55:54PM +0100, Christophe Leroy wrote:
> Unlike many architectures, the powerpc 8xx hardware tablewalk requires
> a two-level process for all page sizes, although the second level
> only has one entry when the page size is 8M.
>
> To fit the Linux page table topology without requiring a special
> page directory layout like hugepd, the page entry will be replicated
> 1024 times in the standard page table. However, for large pages it is
> necessary to set bits in the level-1 (PMD) entry. For the time being,
> for 512k pages the flag is kept in the PTE and inserted into the PMD
> entry at TLB miss exception. That is necessary because pages of
> different sizes can coexist in a page table. However, the 12 PTE bits
> are fully used and there is no room for an additional page size bit.
>
> For 8M pages there will be only one page per PMD entry, so it is
> possible to flag the page size in the PMD entry itself, with the
> advantage that the information will already be in the right place for
> the hardware.
>
> To do so, add a new helper called pmd_populate_size() which takes the
> page size as an additional argument, and modify __pte_alloc() to also
> take that argument. pte_alloc() is left unmodified in order to
> reduce churn on callers, and a pte_alloc_size() is added for use by
> pte_alloc_huge().
>
> When an architecture doesn't provide pmd_populate_size(),
> pmd_populate() is used as a fallback.
I think it would be a good idea to document what the semantics of
sz are supposed to be?
Just a general remark, probably nothing for this, but with these new
arguments the historical naming seems pretty tortured for
pte_alloc_size().. Something like pmd_populate_leaf(size) as a naming
scheme would make this more intuitive. Ie pmd_populate_leaf() gives
you a PMD entry where the entry points to a leaf page table able to
store folios of at least size.
Anyhow, I thought the edits to the mm helpers were fine, certainly
much nicer than hugepd. Do you see a path to remove hugepd entirely
from here?
Thanks,
Jason
On Mon, Mar 25, 2024 at 07:05:01PM +0000, Christophe Leroy wrote:
> Not looked into details yet, but I guess so.
>
> By the way, there is a wiki dedicated to huge pages on powerpc; you can
> have a look at it here:
> https://github.com/linuxppc/wiki/wiki/Huge-pages
> Maybe you'll find good ideas there to help me.
There sure are a lot of page table types here
I'm a bit wondering about terminology, eg on the first diagram "huge
pte entry" means a PUD entry that is a leaf? Which ones are contiguous
replications?
Just general remarks on the ones with huge pages:
hash 64k and hugepage 16M/16G
radix 64k/radix hugepage 2M/1G
radix 4k/radix hugepage 2M/1G
nohash 32
- I think this is just a normal x86 like scheme? PMD/PUD can be a
leaf with the same size as a next level table.
Do any of these cases need to know the higher level to parse the
lower? eg is there a 2M bit in the PUD indicating that the PMD
is a table of 2M leafs or does each PMD entry have a bit
indicating it is a leaf?
hash 4k and hugepage 16M/16G
nohash 64
- How does this work? I guess since 8xx explicitly calls out
consecutive entries, this actually means the pgd can point to 512 256M
entries or 8 16G entries? Ie the table size at each level is
variable? Or is it the same and the table size is still 512 and
each 16G entry is replicated 64 times?
Do the offset accessors already abstract this enough?
8xx 4K
8xx 16K
- As this series does?
Jason
On 26/03/2024 at 16:01, Jason Gunthorpe wrote:
> On Mon, Mar 25, 2024 at 07:05:01PM +0000, Christophe Leroy wrote:
>
>> Not looked into details yet, but I guess so.
>>
>> By the way, there is a wiki dedicated to huge pages on powerpc; you can
>> have a look at it here:
>> https://github.com/linuxppc/wiki/wiki/Huge-pages
>> Maybe you'll find good ideas there to help me.
>
> There sure are a lot of page table types here
>
> I'm a bit wondering about terminology, eg on the first diagram "huge
> pte entry" means a PUD entry that is a leaf? Which ones are contiguous
> replications?
Yes, on the first diagram, a huge pte entry covering the same size as
a pud entry means a leaf PUD entry.
Contiguous replications are only on 8xx for the time being and are
displayed as "consecutive entries".
>
> Just general remarks on the ones with huge pages:
>
> hash 64k and hugepage 16M/16G
> radix 64k/radix hugepage 2M/1G
> radix 4k/radix hugepage 2M/1G
> nohash 32
> - I think this is just a normal x86 like scheme? PMD/PUD can be a
> leaf with the same size as a next level table.
>
> Do any of these cases need to know the higher level to parse the
> lower? eg is there a 2M bit in the PUD indicating that the PMD
> is a table of 2M leafs or does each PMD entry have a bit
> indicating it is a leaf?
For hash and radix there is a bit that tells it is leaf (_PAGE_PTE)
For nohash32/e500 I think the drawing is not fully right; there is a
huge page directory (hugepd) with a single entry. I think it should be
possible to change it to a leaf entry; it seems we have bit _PAGE_SW1
available in the PTE.
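
(Roughly, paraphrased from the book3s64 headers rather than quoted
verbatim, the leaf test is just a check of that bit:)

static inline bool pmd_leaf(pmd_t pmd)
{
	/* _PAGE_PTE set in a PMD entry marks it as a leaf (huge) entry */
	return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
}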
>
> hash 4k and hugepage 16M/16G
> nohash 64
> - How does this work? I guess since 8xx explicitly calls out
> consecutive entries, this actually means the pgd can point to 512 256M
> entries or 8 16G entries? Ie the table size at each level is
> variable? Or is it the same and the table size is still 512 and
> each 16G entry is replicated 64 times?
For those it is using the huge page directory (hugepd), which can be
hooked at any level and is a directory of huge pages on its own. There
are no consecutive entries involved here I think, although I'm not
completely sure.

For hash4k I'm not sure how it works; this was changed by commit
e2b3d202d1db ("powerpc: Switch 16GB and 16MB explicit hugepages to a
different page table format")
For the nohash/64, a PGD entry points either to a regular PUD directory
or to a HUGEPD directory. The size of the HUGEPD directory is encoded in
the 6 lower bits of the PGD entry.
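
(So the decode is just a mask of those bits; paraphrased from memory,
with HUGEPD_SHIFT_MASK being the 6-bit mask:)

static inline unsigned int hugepd_shift(hugepd_t hpd)
{
	/* the low 6 bits give the shift of the huge pages the hugepd maps */
	return hpd_val(hpd) & HUGEPD_SHIFT_MASK;
}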
>
> Do the offset accessors already abstract this enough?
>
> 8xx 4K
> 8xx 16K
> - As this series does?
This is how it is prior to the series, ie 16k and 512k pages are
implemented as contiguous PTEs in a standard page table while 8M pages
are implemented with hugepd and a single entry in it (with two PGD
entries pointing to the same huge page directory).
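
(For the contiguous-PTE side, the existing 8xx code replicates the PTE
value across all the slots covered by the huge page; a simplified
paraphrase, from memory, of set_huge_pte_at() in
arch/powerpc/mm/pgtable.c:)

void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t pte, unsigned long sz)
{
	pmd_t *pmd = pmd_off(mm, addr);
	pte_basic_t val = pte_val(pte);
	pte_basic_t *entry = (pte_basic_t *)ptep;
	int num = number_of_cells_per_pte(pmd, val, 1);
	int i;

	/* write one entry per 4k cell, bumping the physical address */
	for (i = 0; i < num; i++, entry++, val += SZ_4K)
		*entry = val;
}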
Christophe
On Wed, Mar 27, 2024 at 09:58:35AM +0000, Christophe Leroy wrote:
> > Just general remarks on the ones with huge pages:
> >
> > hash 64k and hugepage 16M/16G
> > radix 64k/radix hugepage 2M/1G
> > radix 4k/radix hugepage 2M/1G
> > nohash 32
> > - I think this is just a normal x86 like scheme? PMD/PUD can be a
> > leaf with the same size as a next level table.
> >
> > Do any of these cases need to know the higher level to parse the
> > lower? eg is there a 2M bit in the PUD indicating that the PMD
> > is a table of 2M leafs or does each PMD entry have a bit
> > indicating it is a leaf?
>
> For hash and radix there is a bit that tells it is leaf (_PAGE_PTE)
>
> For nohash32/e500 I think the drawing is not fully right; there is a
> huge page directory (hugepd) with a single entry. I think it should be
> possible to change it to a leaf entry; it seems we have bit _PAGE_SW1
> available in the PTE.
It sounds to me like PPC breaks down into only a couple fundamental
behaviors
- x86 like leaf in many page levels. Use the pgd/pud/pmd_leaf() and
related to implement it
- ARM like contig PTE within a single page table level. Use the
contig stuff to implement it
- Contig PTE across two page table levels with a bit in the
PMD. Needs new support like you showed
- Page table levels with a variable page size. Ie a PUD can point to
a directory of 8 pages or 512 pages of different size. Probably
needs some new core support, but I think your changes to the
*_offset go a long way already.
> >
> > hash 4k and hugepage 16M/16G
> > nohash 64
> > - How does this work? I guess since 8xx explicitly calls out
> > consecutive entries, this actually means the pgd can point to 512 256M
> > entries or 8 16G entries? Ie the table size at each level is
> > variable? Or is it the same and the table size is still 512 and
> > each 16G entry is replicated 64 times?
>
> For those it is using the huge page directory (hugepd), which can be
> hooked at any level and is a directory of huge pages on its own. There
> are no consecutive entries involved here I think, although I'm not
> completely sure.
>
> For hash4k I'm not sure how it works; this was changed by commit
> e2b3d202d1db ("powerpc: Switch 16GB and 16MB explicit hugepages to a
> different page table format")
>
> For the nohash/64, a PGD entry points either to a regular PUD directory
> or to a HUGEPD directory. The size of the HUGEPD directory is encoded in
> the 6 lower bits of the PGD entry.
If it is a software walker there might be value in just aligning to
the contig pte scheme in all levels and forgetting about the variable
size page table levels. That quarter page stuff is a PITA to manage
the memory allocation for on PPC anyhow..
Jason
On 27/03/2024 at 17:57, Jason Gunthorpe wrote:
> On Wed, Mar 27, 2024 at 09:58:35AM +0000, Christophe Leroy wrote:
>>> Just general remarks on the ones with huge pages:
>>>
>>> hash 64k and hugepage 16M/16G
>>> radix 64k/radix hugepage 2M/1G
>>> radix 4k/radix hugepage 2M/1G
>>> nohash 32
>>> - I think this is just a normal x86 like scheme? PMD/PUD can be a
>>> leaf with the same size as a next level table.
>>>
>>> Do any of these cases need to know the higher level to parse the
>>> lower? eg is there a 2M bit in the PUD indicating that the PMD
>>> is a table of 2M leafs or does each PMD entry have a bit
>>> indicating it is a leaf?
>>
>> For hash and radix there is a bit that tells it is leaf (_PAGE_PTE)
>>
>> For nohash32/e500 I think the drawing is not fully right; there is a
>> huge page directory (hugepd) with a single entry. I think it should be
>> possible to change it to a leaf entry; it seems we have bit _PAGE_SW1
>> available in the PTE.
>
> It sounds to me like PPC breaks down into only a couple fundamental
> behaviors
> - x86 like leaf in many page levels. Use the pgd/pud/pmd_leaf() and
> related to implement it
> - ARM like contig PTE within a single page table level. Use the
> contig stuff to implement it
> - Contig PTE across two page table levels with a bit in the
> PMD. Needs new support like you showed
> - Page table levels with a variable page size. Ie a PUD can point to
> a directory of 8 pages or 512 pages of different size. Probably
> needs some new core support, but I think your changes to the
> *_offset go a long way already.
>
>>>
>>> hash 4k and hugepage 16M/16G
>>> nohash 64
>>> - How does this work? I guess since 8xx explicitly calls out
>>> consecutive entries, this actually means the pgd can point to 512 256M
>>> entries or 8 16G entries? Ie the table size at each level is
>>> variable? Or is it the same and the table size is still 512 and
>>> each 16G entry is replicated 64 times?
>>
>> For those it is using the huge page directory (hugepd), which can be
>> hooked at any level and is a directory of huge pages on its own. There
>> are no consecutive entries involved here I think, although I'm not
>> completely sure.
>>
>> For hash4k I'm not sure how it works; this was changed by commit
>> e2b3d202d1db ("powerpc: Switch 16GB and 16MB explicit hugepages to a
>> different page table format")
>>
>> For the nohash/64, a PGD entry points either to a regular PUD directory
>> or to a HUGEPD directory. The size of the HUGEPD directory is encoded in
>> the 6 lower bits of the PGD entry.
>
> If it is a software walker there might be value in just aligning to
> the contig pte scheme in all levels and forgetting about the variable
> size page table levels. That quarter page stuff is a PITA to manage
> the memory allocation for on PPC anyhow..
Looking one step further, into nohash/32, I see a challenge: on that
platform, a PTE is 64 bits while a PGD/PMD entry is 32 bits. It is
therefore not possible as such to do PMD leaf or cont-PMD leaf.
I see two possible solutions:
- Double the size of PGD/PMD entries, but then we lose atomicity when
reading or writing an entry; could this be a problem?
- Do as for the 8xx, ie go down to PTEs even for pages greater than 4M.

Any thoughts?
Christophe
On Wed, Apr 03, 2024 at 06:24:38PM +0000, Christophe Leroy wrote:
> > If it is a software walker there might be value in just aligning to
> > the contig pte scheme in all levels and forgetting about the variable
> > size page table levels. That quarter page stuff is a PITA to manage
> > the memory allocation for on PPC anyhow..
>
> Looking one step further, into nohash/32, I see a challenge: on that
> platform, a PTE is 64 bits while a PGD/PMD entry is 32 bits. It is
> therefore not possible as such to do PMD leaf or cont-PMD leaf.
Hmm, maybe not; I have a feeling you can hide this detail in the
pmd_offset routine if you pass in the PGD information too.
> - Double the size of PGD/PMD entries, but then we lose atomicity when
> reading or writing an entry; could this be a problem?
How does the 64-bit PTE work then? We have ignored this bug on 32-bit
x86, but there is a general, difficult race with 64-bit atomicity on
32-bit CPUs in the page tables.
Ideally you'd have 64 bit entries at the PMD level that encode the
page size the same as the PTE level. So you hit any level and you know
your size. This is less memory efficient (though every other arch
tolerates this) in general cases.
Can you throw away some bits of PA in the 32 bit entries to signal a
size?
> - Do as for the 8xx, ie go down to PTEs even for pages greater than 4M.
Aside from the memory waste, this is the most logical thing: go down
far enough that you can encode the desired page size in the PTE and
use the contig PTE scheme.
Jason
On 25/03/2024 at 17:19, Jason Gunthorpe wrote:
> On Mon, Mar 25, 2024 at 03:55:54PM +0100, Christophe Leroy wrote:
>> Unlike many architectures, the powerpc 8xx hardware tablewalk requires
>> a two-level process for all page sizes, although the second level
>> only has one entry when the page size is 8M.
>>
>> To fit the Linux page table topology without requiring a special
>> page directory layout like hugepd, the page entry will be replicated
>> 1024 times in the standard page table. However, for large pages it is
>> necessary to set bits in the level-1 (PMD) entry. For the time being,
>> for 512k pages the flag is kept in the PTE and inserted into the PMD
>> entry at TLB miss exception. That is necessary because pages of
>> different sizes can coexist in a page table. However, the 12 PTE bits
>> are fully used and there is no room for an additional page size bit.
>>
>> For 8M pages there will be only one page per PMD entry, so it is
>> possible to flag the page size in the PMD entry itself, with the
>> advantage that the information will already be in the right place for
>> the hardware.
>>
>> To do so, add a new helper called pmd_populate_size() which takes the
>> page size as an additional argument, and modify __pte_alloc() to also
>> take that argument. pte_alloc() is left unmodified in order to
>> reduce churn on callers, and a pte_alloc_size() is added for use by
>> pte_alloc_huge().
>>
>> When an architecture doesn't provide pmd_populate_size(),
>> pmd_populate() is used as a fallback.
>
> I think it would be a good idea to document what the semantics of
> sz are supposed to be?
>
> Just a general remark, probably nothing for this, but with these new
> arguments the historical naming seems pretty tortured for
> pte_alloc_size().. Something like pmd_populate_leaf(size) as a naming
> scheme would make this more intuitive. Ie pmd_populate_leaf() gives
> you a PMD entry where the entry points to a leaf page table able to
> store folios of at least size.
I removed patches 1 and 2 and now add bit _PMD_PAGE_8M in the PMD entry
afterwards, in set_huge_pte_at().
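
Roughly the idea, as an illustrative sketch rather than the actual
patch (the real code also keeps the PTE replication as before):

void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t pte, unsigned long sz)
{
	if (sz == SZ_8M) {
		pmd_t *pmdp = pmd_off(mm, addr);

		/* flag the page size where the hardware expects it */
		*pmdp = __pmd(pmd_val(*pmdp) | _PMD_PAGE_8M);
	}
	set_pte_at(mm, addr, ptep, pte);
}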
>
> Anyhow, I thought the edits to the mm helpers were fine, certainly
> much nicer than hugepd. Do you see a path to remove hugepd entirely
> from here?
>
> Thanks,
> Jason