Tail page struct reuse is over-comlicated. Not only because we have
implicit uses of tail page fields (mapcounts, or private for thp swap
support, etc., that we _may_ still use in the page structs, but not obvious
the relationship between that and the folio definitions), but also because
we have 32/64 bits layouts for struct page so it's unclear what we can use
and what we cannot when trying to find a new spot in folio struct.
We also have tricks like page->mapping, where we can reuse only the tail
page 1/2 but nothing more than tail page 2. It is all mostly hidden, until
someone starts to read into a VM_BUG_ON_PAGE() of __split_huge_page_tail().
Let's document it clearly on what we can use and what we can't, with 100%
explanations on each of them. Hopefully this will make:
(1) Any reader to know exactly what field is where and for what, the
relationships between folio tail pages and struct page definitions,
(2) Any potential new fields to be added to a large folio, so we're clear
which field one can still reuse (look for _reserved* ones).
This is assuming WORD is defined as sizeof(void *) on any archs, just like
the other comment in struct page we already have.
One pitfall is I'll need to split part of the tail page 1 definition into
32/64 bits differently, that introduced some duplications on the fields.
But hopefully that's worthwhile as it makes everything crystal clear. Not
to mention that "pitfall" also brings a benefit that we can actually define
fields in different order for 32/64 bits when we want.
Signed-off-by: Peter Xu <[email protected]>
---
include/linux/mm_types.h | 76 +++++++++++++++++++++++++++++++++++-----
1 file changed, 67 insertions(+), 9 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 291c05cacd48..3e40e1b9fec3 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -313,41 +313,99 @@ struct folio {
};
struct page page;
};
+ /*
+ * Some of the tail page fields (out of 8 WORDs for either 32/64
+ * bits archs) may not be reused by the folio object because
+ * they're already been used by the page struct:
+ *
+ * |-------+---------------|
+ * | Index | Field |
+ * |-------+---------------|
+ * | 0 | flag |
+ * | 1 | compound_head |
+ * | 2 | N/A [0] |
+ * | 3 | mapping [1] |
+ * | 4 | N/A [0] |
+ * | 5 | private [2] |
+ * | 6 | mapcount |
+ * | 7 | N/A [0] |
+ * |-------+---------------|
+ *
+ * [0] "N/A" marks fields that are available to leverage for the
+ * large folio.
+ *
+ * [1] "mapping" field is only used for sanity check, see
+ * TAIL_MAPPING. Still valid to use for tail pages 1/2.
+ * (for that, see __split_huge_page_tail()).
+ *
+ * [2] "private" field is used when THP_SWAP is on (disabled on 32
+ * bits, or on hugetlb folios) .
+ */
union {
struct {
+ /* WORD 0-1: not valid to reuse */
unsigned long _flags_1;
unsigned long _head_1;
- /* public: */
+ /* WORD 2 */
unsigned char _folio_dtor;
unsigned char _folio_order;
+ unsigned char _holes_1[2];
+#ifdef CONFIG_64BIT
atomic_t _entire_mapcount;
+ /* WORD 3 */
atomic_t _nr_pages_mapped;
atomic_t _pincount;
-#ifdef CONFIG_64BIT
+ /* WORD 4 */
unsigned int _folio_nr_pages;
+ unsigned int _reserved_1_1;
+ /* WORD 5-6: not valid to reuse */
+ unsigned long _used_1_2[2];
+ /* WORD 7 */
+ unsigned long _reserved_1_2;
+#else
+ /* WORD 3 */
+ atomic_t _entire_mapcount;
+ /* WORD 4 */
+ atomic_t _nr_pages_mapped;
+ /* WORD 5: only valid for 32bits */
+ atomic_t _pincount;
+ /* WORD 6: not valid to reuse */
+ unsigned long _used_1_2;
+ /* WORD 7 */
+ unsigned long _reserved_1;
#endif
- /* private: the union with struct page is transitional */
};
+ /* private: the union with struct page is transitional */
struct page __page_1;
};
union {
struct {
+ /* WORD 0-1: not valid to reuse */
unsigned long _flags_2;
unsigned long _head_2;
- /* public: */
+ /* WORD 2-5 */
void *_hugetlb_subpool;
void *_hugetlb_cgroup;
void *_hugetlb_cgroup_rsvd;
void *_hugetlb_hwpoison;
- /* private: the union with struct page is transitional */
+ /* WORD 6: not valid to reuse */
+ unsigned long _used_2_2;
+ /* WORD 7: */
+ unsigned long _reserved_2_1;
};
struct {
- unsigned long _flags_2a;
- unsigned long _head_2a;
- /* public: */
+ /* WORD 0-1: not valid to reuse */
+ unsigned long _used_2_3[2];
+ /* WORD 2-3: */
struct list_head _deferred_list;
- /* private: the union with struct page is transitional */
+ /* WORD 4: */
+ unsigned long _reserved_2_2;
+ /* WORD 5-6: not valid to reuse */
+ unsigned long _used_2_4[2];
+ /* WORD 7: */
+ unsigned long _reserved_2_3;
};
+ /* private: the union with struct page is transitional */
struct page __page_2;
};
};
--
2.41.0
On Thu, Aug 10, 2023 at 04:49:44PM -0400, Peter Xu wrote:
> Tail page struct reuse is over-comlicated. Not only because we have
> implicit uses of tail page fields (mapcounts, or private for thp swap
> support, etc., that we _may_ still use in the page structs, but not obvious
> the relationship between that and the folio definitions), but also because
> we have 32/64 bits layouts for struct page so it's unclear what we can use
> and what we cannot when trying to find a new spot in folio struct.
I do not like this patch.
> We also have tricks like page->mapping, where we can reuse only the tail
> page 1/2 but nothing more than tail page 2. It is all mostly hidden, until
> someone starts to read into a VM_BUG_ON_PAGE() of __split_huge_page_tail().
We can change those BUG_ON if we want to reuse mapping in more tail pages.
Ask!
> Let's document it clearly on what we can use and what we can't, with 100%
> explanations on each of them. Hopefully this will make:
The explanations are still very page centric. I do not like the style
of them, nor how you explain them.
> One pitfall is I'll need to split part of the tail page 1 definition into
> 32/64 bits differently, that introduced some duplications on the fields.
> But hopefully that's worthwhile as it makes everything crystal clear. Not
> to mention that "pitfall" also brings a benefit that we can actually define
> fields in different order for 32/64 bits when we want.
No. This is going to ruin kernel-doc.
> + /*
> + * Some of the tail page fields (out of 8 WORDs for either 32/64
There's your first mistake; struct page is not necessarily 8 WORDs.
You've got 7 words for sure, then on 32-bit you have 8 because atomic_t
is word-sized. memcg_data might be the 9th word, virtual could be
the tenth, two awful kmsan intrusions could bring it to twelve, and
last_cpupid could bring it to thirteen.
Sure, it's 8 words on x86-64 with CONFIG_MEMCG enabled. But that's
just your system.
> + * bits archs) may not be reused by the folio object because
> + * they're already been used by the page struct:
> + *
> + * |-------+---------------|
> + * | Index | Field |
> + * |-------+---------------|
> + * | 0 | flag |
> + * | 1 | compound_head |
> + * | 2 | N/A [0] |
> + * | 3 | mapping [1] |
> + * | 4 | N/A [0] |
> + * | 5 | private [2] |
> + * | 6 | mapcount |
> + * | 7 | N/A [0] |
This is wrong. You mustn't reuse refcount. refcount must remain 0 on
all tail pages. And you can't use anything after refcount, because it's
all optional on various configurations.
> + * |-------+---------------|
> + *
> + * [0] "N/A" marks fields that are available to leverage for the
> + * large folio.
N/A is a bad way to say this. "Free" or "Available" would be better.
> + * [1] "mapping" field is only used for sanity check, see
> + * TAIL_MAPPING. Still valid to use for tail pages 1/2.
> + * (for that, see __split_huge_page_tail()).
No, definitely wrong to document this.
> + * [2] "private" field is used when THP_SWAP is on (disabled on 32
> + * bits, or on hugetlb folios) .
Ugh, this needs to be fixed, not documented. If you really must
document it, at least say that this needs to be fixed.
> + */
> union {
> struct {
> + /* WORD 0-1: not valid to reuse */
... so now you're repeating all the information you provided above?
> unsigned long _flags_1;
> unsigned long _head_1;
> - /* public: */
... did you check kernel-doc after removing this?
> + /* WORD 2 */
> unsigned char _folio_dtor;
> unsigned char _folio_order;
> + unsigned char _holes_1[2];
No. If you need to search for holes, use pahole.
> +#ifdef CONFIG_64BIT
> atomic_t _entire_mapcount;
> + /* WORD 3 */
> atomic_t _nr_pages_mapped;
> atomic_t _pincount;
> -#ifdef CONFIG_64BIT
> + /* WORD 4 */
> unsigned int _folio_nr_pages;
> + unsigned int _reserved_1_1;
> + /* WORD 5-6: not valid to reuse */
> + unsigned long _used_1_2[2];
> + /* WORD 7 */
> + unsigned long _reserved_1_2;
> +#else
> + /* WORD 3 */
> + atomic_t _entire_mapcount;
> + /* WORD 4 */
> + atomic_t _nr_pages_mapped;
> + /* WORD 5: only valid for 32bits */
> + atomic_t _pincount;
> + /* WORD 6: not valid to reuse */
> + unsigned long _used_1_2;
> + /* WORD 7 */
> + unsigned long _reserved_1;
> #endif
> - /* private: the union with struct page is transitional */
> };
> + /* private: the union with struct page is transitional */
You don't understand why I did it like this. Again, you have to build
the kernel-doc and you'll see why the private: and public: markers are
where they are.
There was even a thread on it, a year or two ago, where I outlined the
various tradeoffs between complexity of the output and the complexity
of the input.