mm/memfd.c is the last remaining user of total_mapcount(). Let's
convert memfd_tag_pins() and memfd_wait_for_pins() to use folios
instead of pages, so we can remove total_mapcount() for good.
We always get a head page, so we can just naturally interpret it as a folio
(similar to other code).
Cc: Andrew Morton <[email protected]>
Cc: Matthew Wilcox (Oracle) <[email protected]>
Signed-off-by: David Hildenbrand <[email protected]>
---
Did a quick test with write-sealing a memfd backed by THP. Seems to work
as it used to.
---
include/linux/mm.h | 9 +--------
mm/memfd.c | 34 ++++++++++++++++++----------------
2 files changed, 19 insertions(+), 24 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6f4825d82965..49e22a2f6ccc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1183,7 +1183,7 @@ static inline int is_vmalloc_or_module_addr(const void *x)
* How many times the entire folio is mapped as a single unit (eg by a
* PMD or PUD entry). This is probably not what you want, except for
* debugging purposes - it does not include PTE-mapped sub-pages; look
- * at folio_mapcount() or page_mapcount() or total_mapcount() instead.
+ * at folio_mapcount() or page_mapcount() instead.
*/
static inline int folio_entire_mapcount(struct folio *folio)
{
@@ -1243,13 +1243,6 @@ static inline int folio_mapcount(struct folio *folio)
return folio_total_mapcount(folio);
}
-static inline int total_mapcount(struct page *page)
-{
- if (likely(!PageCompound(page)))
- return atomic_read(&page->_mapcount) + 1;
- return folio_total_mapcount(page_folio(page));
-}
-
static inline bool folio_large_is_mapped(struct folio *folio)
{
/*
diff --git a/mm/memfd.c b/mm/memfd.c
index d3a1ba4208c9..0a6c1a6ee03b 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -31,24 +31,25 @@
static void memfd_tag_pins(struct xa_state *xas)
{
- struct page *page;
+ struct folio *folio;
int latency = 0;
int cache_count;
lru_add_drain();
xas_lock_irq(xas);
- xas_for_each(xas, page, ULONG_MAX) {
+ xas_for_each(xas, folio, ULONG_MAX) {
cache_count = 1;
- if (!xa_is_value(page) &&
- PageTransHuge(page) && !PageHuge(page))
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ !xa_is_value(folio) && folio_test_large(folio) &&
+ !folio_test_hugetlb(folio))
cache_count = HPAGE_PMD_NR;
- if (!xa_is_value(page) &&
- page_count(page) - total_mapcount(page) != cache_count)
+ if (!xa_is_value(folio) && cache_count !=
+ folio_ref_count(folio) - folio_mapcount(folio))
xas_set_mark(xas, MEMFD_TAG_PINNED);
if (cache_count != 1)
- xas_set(xas, page->index + cache_count);
+ xas_set(xas, folio->index + cache_count);
latency += cache_count;
if (latency < XA_CHECK_SCHED)
@@ -66,16 +67,16 @@ static void memfd_tag_pins(struct xa_state *xas)
/*
* Setting SEAL_WRITE requires us to verify there's no pending writer. However,
* via get_user_pages(), drivers might have some pending I/O without any active
- * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
+ * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all folios
* and see whether it has an elevated ref-count. If so, we tag them and wait for
* them to be dropped.
* The caller must guarantee that no new user will acquire writable references
- * to those pages to avoid races.
+ * to those folios to avoid races.
*/
static int memfd_wait_for_pins(struct address_space *mapping)
{
XA_STATE(xas, &mapping->i_pages, 0);
- struct page *page;
+ struct folio *folio;
int error, scan;
memfd_tag_pins(&xas);
@@ -95,20 +96,21 @@ static int memfd_wait_for_pins(struct address_space *mapping)
xas_set(&xas, 0);
xas_lock_irq(&xas);
- xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
+ xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) {
bool clear = true;
cache_count = 1;
- if (!xa_is_value(page) &&
- PageTransHuge(page) && !PageHuge(page))
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ !xa_is_value(folio) && folio_test_large(folio) &&
+ !folio_test_hugetlb(folio))
cache_count = HPAGE_PMD_NR;
- if (!xa_is_value(page) && cache_count !=
- page_count(page) - total_mapcount(page)) {
+ if (!xa_is_value(folio) && cache_count !=
+ folio_ref_count(folio) - folio_mapcount(folio)) {
/*
* On the last scan, we clean up all those tags
* we inserted; but make a note that we still
- * found pages pinned.
+ * found folios pinned.
*/
if (scan == LAST_SCAN)
error = -EBUSY;
--
2.43.2
On 22.02.24 18:13, Matthew Wilcox wrote:
> On Thu, Feb 22, 2024 at 05:09:43PM +0100, David Hildenbrand wrote:
>> We always get a head page, so we can just naturally interpret is as a folio
>> (similar to other code).
>
> memfd seems rather confused about how to iterate over the page cache.
> Perhaps we could sort that out and then delete total_mapcount as a
> second patch?
>
> I haven't tested this at all, but ...
>
> Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
>
> diff --git a/mm/memfd.c b/mm/memfd.c
> index d3a1ba4208c9..45e55b0e3cbe 100644
> --- a/mm/memfd.c
> +++ b/mm/memfd.c
> @@ -29,28 +29,29 @@
> #define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE
> #define LAST_SCAN 4 /* about 150ms max */
>
> +static bool memfd_extra_refs(struct folio *folio)
> +{
> + return folio_ref_count(folio) - folio_mapcount(folio) !=
> + folio_nr_pages(folio);
> +}
That is an obvious improvement I should have realized myself.
Let me play with that.
Thanks!
--
Cheers,
David / dhildenb
On Thu, Feb 22, 2024 at 05:09:43PM +0100, David Hildenbrand wrote:
> We always get a head page, so we can just naturally interpret is as a folio
> (similar to other code).
memfd seems rather confused about how to iterate over the page cache.
Perhaps we could sort that out and then delete total_mapcount as a
second patch?
I haven't tested this at all, but ...
Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
diff --git a/mm/memfd.c b/mm/memfd.c
index d3a1ba4208c9..45e55b0e3cbe 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -29,28 +29,29 @@
#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE
#define LAST_SCAN 4 /* about 150ms max */
+static bool memfd_extra_refs(struct folio *folio)
+{
+ return folio_ref_count(folio) - folio_mapcount(folio) !=
+ folio_nr_pages(folio);
+}
+
static void memfd_tag_pins(struct xa_state *xas)
{
- struct page *page;
+ struct folio *folio;
int latency = 0;
- int cache_count;
lru_add_drain();
xas_lock_irq(xas);
- xas_for_each(xas, page, ULONG_MAX) {
- cache_count = 1;
- if (!xa_is_value(page) &&
- PageTransHuge(page) && !PageHuge(page))
- cache_count = HPAGE_PMD_NR;
-
- if (!xa_is_value(page) &&
- page_count(page) - total_mapcount(page) != cache_count)
+ xas_for_each(xas, folio, ULONG_MAX) {
+ /* Can we have shadow/swap entries in memfd? */
+ if (xa_is_value(folio))
+ continue;
+
+ if (memfd_extra_refs(folio))
xas_set_mark(xas, MEMFD_TAG_PINNED);
- if (cache_count != 1)
- xas_set(xas, page->index + cache_count);
- latency += cache_count;
+ latency++;
if (latency < XA_CHECK_SCHED)
continue;
latency = 0;
@@ -75,7 +76,6 @@ static void memfd_tag_pins(struct xa_state *xas)
static int memfd_wait_for_pins(struct address_space *mapping)
{
XA_STATE(xas, &mapping->i_pages, 0);
- struct page *page;
int error, scan;
memfd_tag_pins(&xas);
@@ -83,7 +83,7 @@ static int memfd_wait_for_pins(struct address_space *mapping)
error = 0;
for (scan = 0; scan <= LAST_SCAN; scan++) {
int latency = 0;
- int cache_count;
+ struct folio *folio;
if (!xas_marked(&xas, MEMFD_TAG_PINNED))
break;
@@ -95,16 +95,10 @@ static int memfd_wait_for_pins(struct address_space *mapping)
xas_set(&xas, 0);
xas_lock_irq(&xas);
- xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
+ xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) {
bool clear = true;
- cache_count = 1;
- if (!xa_is_value(page) &&
- PageTransHuge(page) && !PageHuge(page))
- cache_count = HPAGE_PMD_NR;
-
- if (!xa_is_value(page) && cache_count !=
- page_count(page) - total_mapcount(page)) {
+ if (memfd_extra_refs(folio)) {
/*
* On the last scan, we clean up all those tags
* we inserted; but make a note that we still
@@ -118,8 +112,7 @@ static int memfd_wait_for_pins(struct address_space *mapping)
if (clear)
xas_clear_mark(&xas, MEMFD_TAG_PINNED);
- latency += cache_count;
- if (latency < XA_CHECK_SCHED)
+ if (++latency < XA_CHECK_SCHED)
continue;
latency = 0;