If more than one futex is placed on a shmem huge page, it can happen that
waking the second wakes the first instead, and leaves the second waiting:
the key's shared.pgoff is wrong.
When 3.11 commit 13d60f4b6ab5 ("futex: Take hugepages into account when
generating futex_key") was merged, the only shared huge pages came from
hugetlbfs, and the code added to deal with its exceptional page->index was
put into hugetlb source. Then that was missed when 4.8 added shmem huge pages.
page_to_pgoff() is what others use for this nowadays: except that, as
currently written, it gives the right answer on hugetlbfs head, but
nonsense on hugetlbfs tails. Fix that by calling hugetlbfs-specific
hugetlb_basepage_index() on PageHuge tails as well as on head.
Yes, it's unconventional to declare hugetlb_basepage_index() there in
pagemap.h, rather than in hugetlb.h; but I do not expect anything but
page_to_pgoff() ever to need it.
Fixes: 800d8c63b2e9 ("shmem: add huge pages support")
Reported-by: Neel Natu <[email protected]>
Signed-off-by: Hugh Dickins <[email protected]>
Cc: <[email protected]>
---
include/linux/hugetlb.h | 16 ----------------
include/linux/pagemap.h | 13 ++++++-------
kernel/futex.c | 3 +--
mm/hugetlb.c | 5 +----
4 files changed, 8 insertions(+), 29 deletions(-)
--- 5.13-rc5/include/linux/hugetlb.h 2021-05-09 17:25:09.278703159 -0700
+++ linux/include/linux/hugetlb.h 2021-06-11 17:30:28.726720252 -0700
@@ -733,17 +733,6 @@ static inline int hstate_index(struct hs
return h - hstates;
}
-pgoff_t __basepage_index(struct page *page);
-
-/* Return page->index in PAGE_SIZE units */
-static inline pgoff_t basepage_index(struct page *page)
-{
- if (!PageCompound(page))
- return page->index;
-
- return __basepage_index(page);
-}
-
extern int dissolve_free_huge_page(struct page *page);
extern int dissolve_free_huge_pages(unsigned long start_pfn,
unsigned long end_pfn);
@@ -980,11 +969,6 @@ static inline int hstate_index(struct hs
return 0;
}
-static inline pgoff_t basepage_index(struct page *page)
-{
- return page->index;
-}
-
static inline int dissolve_free_huge_page(struct page *page)
{
return 0;
--- 5.13-rc5/include/linux/pagemap.h 2021-05-16 22:49:30.036176843 -0700
+++ linux/include/linux/pagemap.h 2021-06-11 17:30:28.726720252 -0700
@@ -516,8 +516,7 @@ static inline struct page *read_mapping_
}
/*
- * Get index of the page with in radix-tree
- * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE)
+ * Get index of the page within radix-tree (but not for hugetlb pages).
*/
static inline pgoff_t page_to_index(struct page *page)
{
@@ -536,14 +535,14 @@ static inline pgoff_t page_to_index(stru
}
/*
- * Get the offset in PAGE_SIZE.
- * (TODO: hugepage should have ->index in PAGE_SIZE)
+ * Get the offset in PAGE_SIZE (even for hugetlb pages).
*/
static inline pgoff_t page_to_pgoff(struct page *page)
{
- if (unlikely(PageHeadHuge(page)))
- return page->index << compound_order(page);
-
+ if (unlikely(PageHuge(page))) {
+ extern pgoff_t hugetlb_basepage_index(struct page *page);
+ return hugetlb_basepage_index(page);
+ }
return page_to_index(page);
}
--- 5.13-rc5/kernel/futex.c 2021-05-09 17:25:09.670705811 -0700
+++ linux/kernel/futex.c 2021-06-11 17:30:28.726720252 -0700
@@ -35,7 +35,6 @@
#include <linux/jhash.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
-#include <linux/hugetlb.h>
#include <linux/freezer.h>
#include <linux/memblock.h>
#include <linux/fault-inject.h>
@@ -650,7 +649,7 @@ again:
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
key->shared.i_seq = get_inode_sequence_number(inode);
- key->shared.pgoff = basepage_index(tail);
+ key->shared.pgoff = page_to_pgoff(tail);
rcu_read_unlock();
}
--- 5.13-rc5/mm/hugetlb.c 2021-06-06 16:57:26.263006733 -0700
+++ linux/mm/hugetlb.c 2021-06-11 17:30:28.730720276 -0700
@@ -1588,15 +1588,12 @@ struct address_space *hugetlb_page_mappi
return NULL;
}
-pgoff_t __basepage_index(struct page *page)
+pgoff_t hugetlb_basepage_index(struct page *page)
{
struct page *page_head = compound_head(page);
pgoff_t index = page_index(page_head);
unsigned long compound_idx;
- if (!PageHuge(page_head))
- return page_index(page);
-
if (compound_order(page_head) >= MAX_ORDER)
compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
else
On Fri, Jun 11, 2021 at 09:31:16PM -0700, Hugh Dickins wrote:
> +++ linux/include/linux/pagemap.h 2021-06-11 17:30:28.726720252 -0700
> @@ -516,8 +516,7 @@ static inline struct page *read_mapping_
> }
>
> /*
> - * Get index of the page with in radix-tree
> - * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE)
> + * Get index of the page within radix-tree (but not for hugetlb pages).
> */
I think the TODO should be retained. It's still something that I
intend to do.
> static inline pgoff_t page_to_pgoff(struct page *page)
> {
> - if (unlikely(PageHeadHuge(page)))
> - return page->index << compound_order(page);
> -
> + if (unlikely(PageHuge(page))) {
> + extern pgoff_t hugetlb_basepage_index(struct page *page);
> + return hugetlb_basepage_index(page);
> + }
> return page_to_index(page);
> }
Yes, this change absolutely makes sense. It's just a landmine if it
returns the right answer for some tail pages but garbage for other
kinds of tail pages.
> --- 5.13-rc5/mm/hugetlb.c 2021-06-06 16:57:26.263006733 -0700
> +++ linux/mm/hugetlb.c 2021-06-11 17:30:28.730720276 -0700
> @@ -1588,15 +1588,12 @@ struct address_space *hugetlb_page_mappi
> return NULL;
> }
>
> -pgoff_t __basepage_index(struct page *page)
> +pgoff_t hugetlb_basepage_index(struct page *page)
> {
> struct page *page_head = compound_head(page);
> pgoff_t index = page_index(page_head);
> unsigned long compound_idx;
>
> - if (!PageHuge(page_head))
> - return page_index(page);
> -
> if (compound_order(page_head) >= MAX_ORDER)
> compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
> else
>
urgh. this trailing bit should be:
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
#else
compound_idx = page - page_head;
#endif
On Sat, 12 Jun 2021, Matthew Wilcox wrote:
> On Fri, Jun 11, 2021 at 09:31:16PM -0700, Hugh Dickins wrote:
> > +++ linux/include/linux/pagemap.h 2021-06-11 17:30:28.726720252 -0700
> > @@ -516,8 +516,7 @@ static inline struct page *read_mapping_
> > }
> >
> > /*
> > - * Get index of the page with in radix-tree
> > - * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE)
> > + * Get index of the page within radix-tree (but not for hugetlb pages).
> > */
>
> I think the TODO should be retained. It's still something that I
> intend to do.
Okay. I did not mean to imply, by removing those TODOs, that they
should not be done: just that they were a developer's notes to self,
that I found distracting there.
I've restored both TODOs (but changed the second to say
"hugetlb pages" explicitly, rather than the ambiguous "hugepage").
> > --- 5.13-rc5/mm/hugetlb.c 2021-06-06 16:57:26.263006733 -0700
> > +++ linux/mm/hugetlb.c 2021-06-11 17:30:28.730720276 -0700
> > @@ -1588,15 +1588,12 @@ struct address_space *hugetlb_page_mappi
> > return NULL;
> > }
> >
> > -pgoff_t __basepage_index(struct page *page)
> > +pgoff_t hugetlb_basepage_index(struct page *page)
> > {
> > struct page *page_head = compound_head(page);
> > pgoff_t index = page_index(page_head);
> > unsigned long compound_idx;
> >
> > - if (!PageHuge(page_head))
> > - return page_index(page);
> > -
> > if (compound_order(page_head) >= MAX_ORDER)
> > compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
> > else
> >
>
> urgh. this trailing bit should be:
>
> #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
> compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
> #else
> compound_idx = page - page_head;
> #endif
I don't see what's wrong with what's there, myself. Unfamiliar territory
to me, but mem_map_next() appears to have the same MAX_ORDER expectation.
Or perhaps you're just suggesting an optimization.
If it were obvious to me, I'd have gladly folded it in; but no,
please send your own patch for that, running it by Mike Kravetz
and Mike Rapoport and David Hildenbrand, I think.
Thanks, v2 follows,
Hugh
If more than one futex is placed on a shmem huge page, it can happen that
waking the second wakes the first instead, and leaves the second waiting:
the key's shared.pgoff is wrong.
When 3.11 commit 13d60f4b6ab5 ("futex: Take hugepages into account when
generating futex_key") was merged, the only shared huge pages came from
hugetlbfs, and the code added to deal with its exceptional page->index was
put into hugetlb source. Then that was missed when 4.8 added shmem huge pages.
page_to_pgoff() is what others use for this nowadays: except that, as
currently written, it gives the right answer on hugetlbfs head, but
nonsense on hugetlbfs tails. Fix that by calling hugetlbfs-specific
hugetlb_basepage_index() on PageHuge tails as well as on head.
Yes, it's unconventional to declare hugetlb_basepage_index() there in
pagemap.h, rather than in hugetlb.h; but I do not expect anything but
page_to_pgoff() ever to need it.
Fixes: 800d8c63b2e9 ("shmem: add huge pages support")
Reported-by: Neel Natu <[email protected]>
Signed-off-by: Hugh Dickins <[email protected]>
Cc: <[email protected]>
---
v2: restored TODO comments, per Matthew; make "hugetlb pages" explicit.
include/linux/hugetlb.h | 16 ----------------
include/linux/pagemap.h | 13 +++++++------
kernel/futex.c | 3 +--
mm/hugetlb.c | 5 +----
4 files changed, 9 insertions(+), 28 deletions(-)
--- 5.13-rc5/include/linux/hugetlb.h 2021-05-09 17:25:09.278703159 -0700
+++ linux/include/linux/hugetlb.h 2021-06-11 17:30:28.726720252 -0700
@@ -733,17 +733,6 @@ static inline int hstate_index(struct hs
return h - hstates;
}
-pgoff_t __basepage_index(struct page *page);
-
-/* Return page->index in PAGE_SIZE units */
-static inline pgoff_t basepage_index(struct page *page)
-{
- if (!PageCompound(page))
- return page->index;
-
- return __basepage_index(page);
-}
-
extern int dissolve_free_huge_page(struct page *page);
extern int dissolve_free_huge_pages(unsigned long start_pfn,
unsigned long end_pfn);
@@ -980,11 +969,6 @@ static inline int hstate_index(struct hs
return 0;
}
-static inline pgoff_t basepage_index(struct page *page)
-{
- return page->index;
-}
-
static inline int dissolve_free_huge_page(struct page *page)
{
return 0;
--- 5.13-rc5/include/linux/pagemap.h 2021-05-16 22:49:30.036176843 -0700
+++ linux/include/linux/pagemap.h 2021-06-12 19:29:23.364387191 -0700
@@ -516,7 +516,7 @@ static inline struct page *read_mapping_
}
/*
- * Get index of the page with in radix-tree
+ * Get index of the page within radix-tree (but not for hugetlb pages).
* (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE)
*/
static inline pgoff_t page_to_index(struct page *page)
@@ -536,14 +536,15 @@ static inline pgoff_t page_to_index(stru
}
/*
- * Get the offset in PAGE_SIZE.
- * (TODO: hugepage should have ->index in PAGE_SIZE)
+ * Get the offset in PAGE_SIZE (even for hugetlb pages).
+ * (TODO: hugetlb pages should have ->index in PAGE_SIZE)
*/
static inline pgoff_t page_to_pgoff(struct page *page)
{
- if (unlikely(PageHeadHuge(page)))
- return page->index << compound_order(page);
-
+ if (unlikely(PageHuge(page))) {
+ extern pgoff_t hugetlb_basepage_index(struct page *page);
+ return hugetlb_basepage_index(page);
+ }
return page_to_index(page);
}
--- 5.13-rc5/kernel/futex.c 2021-05-09 17:25:09.670705811 -0700
+++ linux/kernel/futex.c 2021-06-11 17:30:28.726720252 -0700
@@ -35,7 +35,6 @@
#include <linux/jhash.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
-#include <linux/hugetlb.h>
#include <linux/freezer.h>
#include <linux/memblock.h>
#include <linux/fault-inject.h>
@@ -650,7 +649,7 @@ again:
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
key->shared.i_seq = get_inode_sequence_number(inode);
- key->shared.pgoff = basepage_index(tail);
+ key->shared.pgoff = page_to_pgoff(tail);
rcu_read_unlock();
}
--- 5.13-rc5/mm/hugetlb.c 2021-06-06 16:57:26.263006733 -0700
+++ linux/mm/hugetlb.c 2021-06-11 17:30:28.730720276 -0700
@@ -1588,15 +1588,12 @@ struct address_space *hugetlb_page_mappi
return NULL;
}
-pgoff_t __basepage_index(struct page *page)
+pgoff_t hugetlb_basepage_index(struct page *page)
{
struct page *page_head = compound_head(page);
pgoff_t index = page_index(page_head);
unsigned long compound_idx;
- if (!PageHuge(page_head))
- return page_index(page);
-
if (compound_order(page_head) >= MAX_ORDER)
compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
else
On Sat, Jun 12, 2021 at 08:16:58PM -0700, Hugh Dickins wrote:
> If more than one futex is placed on a shmem huge page, it can happen that
> waking the second wakes the first instead, and leaves the second waiting:
> the key's shared.pgoff is wrong.
>
> When 3.11 commit 13d60f4b6ab5 ("futex: Take hugepages into account when
> generating futex_key"), the only shared huge pages came from hugetlbfs,
> and the code added to deal with its exceptional page->index was put into
> hugetlb source. Then that was missed when 4.8 added shmem huge pages.
>
> page_to_pgoff() is what others use for this nowadays: except that, as
> currently written, it gives the right answer on hugetlbfs head, but
> nonsense on hugetlbfs tails. Fix that by calling hugetlbfs-specific
> hugetlb_basepage_index() on PageHuge tails as well as on head.
>
> Yes, it's unconventional to declare hugetlb_basepage_index() there in
> pagemap.h, rather than in hugetlb.h; but I do not expect anything but
> page_to_pgoff() ever to need it.
>
> Fixes: 800d8c63b2e9 ("shmem: add huge pages support")
> Reported-by: Neel Natu <[email protected]>
> Signed-off-by: Hugh Dickins <[email protected]>
> Cc: <[email protected]>
Reviewed-by: Matthew Wilcox (Oracle) <[email protected]>
On Sat, Jun 12 2021 at 20:16, Hugh Dickins wrote:
> If more than one futex is placed on a shmem huge page, it can happen that
> waking the second wakes the first instead, and leaves the second waiting:
> the key's shared.pgoff is wrong.
>
> When 3.11 commit 13d60f4b6ab5 ("futex: Take hugepages into account when
> generating futex_key"), the only shared huge pages came from hugetlbfs,
> and the code added to deal with its exceptional page->index was put into
> hugetlb source. Then that was missed when 4.8 added shmem huge pages.
>
> page_to_pgoff() is what others use for this nowadays: except that, as
> currently written, it gives the right answer on hugetlbfs head, but
> nonsense on hugetlbfs tails. Fix that by calling hugetlbfs-specific
> hugetlb_basepage_index() on PageHuge tails as well as on head.
>
> Yes, it's unconventional to declare hugetlb_basepage_index() there in
> pagemap.h, rather than in hugetlb.h; but I do not expect anything but
> page_to_pgoff() ever to need it.
>
> Fixes: 800d8c63b2e9 ("shmem: add huge pages support")
> Reported-by: Neel Natu <[email protected]>
> Signed-off-by: Hugh Dickins <[email protected]>
> Cc: <[email protected]>
Assuming this goes through mm:
Acked-by: Thomas Gleixner <[email protected]>