LinuxLists.cc - [RFC][PATCH 5/9] mm/migrate: demote pages during reclaim

2020-10-07 16:26:42

Subject: [RFC][PATCH 5/9] mm/migrate: demote pages during reclaim

From: Dave Hansen <[email protected]>

This is mostly derived from a patch from Yang Shi:

https://lore.kernel.org/linux-mm/[email protected]/

Add code to the reclaim path (shrink_page_list()) to "demote" data
to another NUMA node instead of discarding the data. This always
avoids the cost of I/O needed to read the page back in and sometimes
avoids the writeout cost when the page is dirty.

A second pass through shrink_page_list() will be made if any demotions
fail. This essentially falls back to normal reclaim behavior in the
case that demotions fail. Previous versions of this patch may have
simply failed to reclaim pages which were eligible for demotion but
were unable to be demoted in practice.

Note: This just adds the start of infrastructure for migration. It is
actually disabled next to the FIXME in migrate_demote_page_ok().

Signed-off-by: Dave Hansen <[email protected]>
Cc: Yang Shi <[email protected]>
Cc: David Rientjes <[email protected]>
Cc: Huang Ying <[email protected]>
Cc: Dan Williams <[email protected]>

--

changes from 20200730:
* Add another pass through shrink_page_list() when demotion
fails.
---

b/include/linux/migrate.h | 2
b/mm/vmscan.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 99 insertions(+)

diff -puN include/linux/migrate.h~demote-with-migrate_pages include/linux/migrate.h
--- a/include/linux/migrate.h~demote-with-migrate_pages 2020-10-07 09:15:31.028642442 -0700
+++ b/include/linux/migrate.h 2020-10-07 09:15:31.034642442 -0700
@@ -27,6 +27,7 @@ enum migrate_reason {
MR_MEMPOLICY_MBIND,
MR_NUMA_MISPLACED,
MR_CONTIG_RANGE,
+ MR_DEMOTION,
MR_TYPES
};

@@ -196,6 +197,7 @@ struct migrate_vma {
int migrate_vma_setup(struct migrate_vma *args);
void migrate_vma_pages(struct migrate_vma *migrate);
void migrate_vma_finalize(struct migrate_vma *migrate);
+int next_demotion_node(int node);

#endif /* CONFIG_MIGRATION */

diff -puN mm/vmscan.c~demote-with-migrate_pages mm/vmscan.c
--- a/mm/vmscan.c~demote-with-migrate_pages 2020-10-07 09:15:31.030642442 -0700
+++ b/mm/vmscan.c 2020-10-07 09:15:31.037642442 -0700
@@ -43,6 +43,7 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
+#include <linux/migrate.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
@@ -1034,6 +1035,24 @@ static enum page_references page_check_r
return PAGEREF_RECLAIM;
}

+bool migrate_demote_page_ok(struct page *page, struct scan_control *sc)
+{
+ int next_nid = next_demotion_node(page_to_nid(page));
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageHuge(page), page);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+
+ if (next_nid == NUMA_NO_NODE)
+ return false;
+ if (PageTransHuge(page) && !thp_migration_supported())
+ return false;
+
+ // FIXME: actually enable this later in the series
+ return false;
+}
+
+
/* Check if a page is dirty or under writeback */
static void page_check_dirty_writeback(struct page *page,
bool *dirty, bool *writeback)
@@ -1064,6 +1083,60 @@ static void page_check_dirty_writeback(s
mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
}

+static struct page *alloc_demote_page(struct page *page, unsigned long node)
+{
+ /*
+ * Try to fail quickly if memory on the target node is not
+ * available. Leaving out __GFP_IO and __GFP_FS helps with
+ * this. If the destination node is full, we want kswapd to
+ * run there so that its pages will get reclaimed and future
+ * migration attempts may succeed.
+ */
+ gfp_t flags = (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_NORETRY |
+ __GFP_NOMEMALLOC | __GFP_NOWARN | __GFP_THISNODE |
+ __GFP_KSWAPD_RECLAIM);
+ /* HugeTLB pages should not be on the LRU */
+ WARN_ON_ONCE(PageHuge(page));
+
+ if (PageTransHuge(page)) {
+ struct page *thp;
+
+ flags |= __GFP_COMP;
+
+ thp = alloc_pages_node(node, flags, HPAGE_PMD_ORDER);
+ if (!thp)
+ return NULL;
+ prep_transhuge_page(thp);
+ return thp;
+ }
+
+ return __alloc_pages_node(node, flags, 0);
+}
+
+/*
+ * Take pages on @demote_pages and attempt to demote them to
+ * another node. Pages which are not demoted are left on
+ * @demote_pages.
+ */
+static unsigned int demote_page_list(struct list_head *demote_pages,
+ struct pglist_data *pgdat,
+ struct scan_control *sc)
+{
+ int target_nid = next_demotion_node(pgdat->node_id);
+ unsigned int nr_succeeded = 0;
+ int err;
+
+ if (list_empty(demote_pages))
+ return 0;
+
+ /* Demotion ignores all cpuset and mempolicy settings */
+ err = migrate_pages(demote_pages, alloc_demote_page, NULL,
+ target_nid, MIGRATE_ASYNC, MR_DEMOTION,
+ &nr_succeeded);
+
+ return nr_succeeded;
+}
+
/*
* shrink_page_list() returns the number of reclaimed pages
*/
@@ -1076,12 +1149,15 @@ static unsigned int shrink_page_list(str
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
+ LIST_HEAD(demote_pages);
unsigned int nr_reclaimed = 0;
unsigned int pgactivate = 0;
+ bool do_demote_pass = true;

memset(stat, 0, sizeof(*stat));
cond_resched();

+retry:
while (!list_empty(page_list)) {
struct address_space *mapping;
struct page *page;
@@ -1231,6 +1307,16 @@ static unsigned int shrink_page_list(str
}

/*
+ * Before reclaiming the page, try to relocate
+ * its contents to another node.
+ */
+ if (do_demote_pass && migrate_demote_page_ok(page, sc)) {
+ list_add(&page->lru, &demote_pages);
+ unlock_page(page);
+ continue;
+ }
+
+ /*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
* Lazyfree page could be freed directly
@@ -1477,6 +1563,17 @@ keep:
list_add(&page->lru, &ret_pages);
VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
}
+ /* 'page_list' is always empty here */
+
+ /* Migrate pages selected for demotion */
+ nr_reclaimed += demote_page_list(&demote_pages, pgdat, sc);
+ /* Pages that could not be demoted are still in @demote_pages */
+ if (!list_empty(&demote_pages)) {
+ /* Pages which failed to be demoted go back on @page_list for retry: */
+ list_splice_init(&demote_pages, page_list);
+ do_demote_pass = false;
+ goto retry;
+ }

pgactivate = stat->nr_activate[0] + stat->nr_activate[1];

_

2020-10-28 13:57:47

by Oscar Salvador

[permalink] [raw]

Subject: Re: [RFC][PATCH 5/9] mm/migrate: demote pages during reclaim

On Wed, Oct 07, 2020 at 09:17:45AM -0700, Dave Hansen wrote:
> Signed-off-by: Dave Hansen <[email protected]>
> Cc: Yang Shi <[email protected]>
> Cc: David Rientjes <[email protected]>
> Cc: Huang Ying <[email protected]>
> Cc: Dan Williams <[email protected]>

I am still going through all the details, but just my thoughts on things
that caught my eye:

> --- a/include/linux/migrate.h~demote-with-migrate_pages 2020-10-07 09:15:31.028642442 -0700
> +++ b/include/linux/migrate.h 2020-10-07 09:15:31.034642442 -0700
> @@ -27,6 +27,7 @@ enum migrate_reason {
> MR_MEMPOLICY_MBIND,
> MR_NUMA_MISPLACED,
> MR_CONTIG_RANGE,
> + MR_DEMOTION,
> MR_TYPES

I think you also need to add it under include/trace/events/migrate.h, so
mm_migrate_pages event can know about it.

> +bool migrate_demote_page_ok(struct page *page, struct scan_control *sc)

Make it static?
Also, scan_control seems to be unused here.

> +{
> + int next_nid = next_demotion_node(page_to_nid(page));
> +
> + VM_BUG_ON_PAGE(!PageLocked(page), page);

Right after the call to migrate_demote_page_ok, we call unlock_page
which already has this check in place.
I know that this is only to be on the safe side and we do not lose anything,
but just my thoughts.

> +static struct page *alloc_demote_page(struct page *page, unsigned long node)
> +{
> + /*
> + * Try to fail quickly if memory on the target node is not
> + * available. Leaving out __GFP_IO and __GFP_FS helps with
> + * this. If the desintation node is full, we want kswapd to
> + * run there so that its pages will get reclaimed and future
> + * migration attempts may succeed.
> + */
> + gfp_t flags = (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_NORETRY |
> + __GFP_NOMEMALLOC | __GFP_NOWARN | __GFP_THISNODE |
> + __GFP_KSWAPD_RECLAIM);

I think it would be nicer to have this as a real GFP_ thingy defined.
e.g: GFP_DEMOTION

> + /* HugeTLB pages should not be on the LRU */
> + WARN_ON_ONCE(PageHuge(page));

I am not sure about this one.
This could only happen if the page, which is now on another list, ends up in
the buddy system. That is quite unlikely tbh.
And nevertheless, this is only a warning, which means that if this scenario gets
to happen, we will be allocating a single page to satisfy a higher-order page, and
I am not sure about the situation we will end up with.

> +
> + if (PageTransHuge(page)) {
> + struct page *thp;
> +
> + flags |= __GFP_COMP;
> +
> + thp = alloc_pages_node(node, flags, HPAGE_PMD_ORDER);
> + if (!thp)
> + return NULL;
> + prep_transhuge_page(thp);
> + return thp;
> + }
> +
> + return __alloc_pages_node(node, flags, 0);

Would make sense to transform this in some sort of new_demotion_page,
which actually calls alloc_migration_target with the right stuff in place?
And then pass a struct migration_target_control so alloc_migration_target
does the right thing.
alloc_migration_target also takes care of calling prep_transhuge_page
when needed.
e.g:

static struct page *new_demotion_node(struct page *page, unsigned long private)
{
struct migration_target_control mtc = {
.nid = private,
.gfp_mask = GFP_DEMOTION,
};

if (PageTransHuge(page))
mtc.gfp_mask |= __GFP_COMP;

return alloc_migration_target(page, (unsigned long)&mtc);
}

The only thing I see is that alloc_migration_target seems to "override"
the gfp_mask and ORs in GFP_TRANSHUGE for THP pages, which includes
__GFP_DIRECT_RECLAIM (not appreciated in this case).
But maybe this can be worked around by checking if gfp_mask == GFP_DEMOTION,
and if so, just keep the mask as it is.

> +
> + if (list_empty(demote_pages))
> + return 0;
> +
> + /* Demotion ignores all cpuset and mempolicy settings */
> + err = migrate_pages(demote_pages, alloc_demote_page, NULL,
> + target_nid, MIGRATE_ASYNC, MR_DEMOTION,
> + &nr_succeeded);

As I said, instead of alloc_demote_page, use a new_demote_page and make
alloc_migration_target handle the allocations and prep thp pages.

--
Oscar Salvador
SUSE L3

2020-10-28 15:59:48

by Yang Shi

[permalink] [raw]

Subject: Re: [RFC][PATCH 5/9] mm/migrate: demote pages during reclaim

On Tue, Oct 27, 2020 at 8:29 AM Oscar Salvador <[email protected]> wrote:
>
> On Wed, Oct 07, 2020 at 09:17:45AM -0700, Dave Hansen wrote:
> > Signed-off-by: Dave Hansen <[email protected]>
> > Cc: Yang Shi <[email protected]>
> > Cc: David Rientjes <[email protected]>
> > Cc: Huang Ying <[email protected]>
> > Cc: Dan Williams <[email protected]>
>
> I am still going through all the details, but just my thoughts on things
> that caught my eye:
>
> > --- a/include/linux/migrate.h~demote-with-migrate_pages 2020-10-07 09:15:31.028642442 -0700
> > +++ b/include/linux/migrate.h 2020-10-07 09:15:31.034642442 -0700
> > @@ -27,6 +27,7 @@ enum migrate_reason {
> > MR_MEMPOLICY_MBIND,
> > MR_NUMA_MISPLACED,
> > MR_CONTIG_RANGE,
> > + MR_DEMOTION,
> > MR_TYPES
>
> I think you also need to add it under include/trace/events/migrate.h, so
> mm_migrate_pages event can know about it.

Agree.

>
> > +bool migrate_demote_page_ok(struct page *page, struct scan_control *sc)
>
> Make it static?
> Also, scan_control seems to be unused here.
>
> > +{
> > + int next_nid = next_demotion_node(page_to_nid(page));
> > +
> > + VM_BUG_ON_PAGE(!PageLocked(page), page);
>
> Right after the call to migrate_demote_page_ok, we call unlock_page
> which already has this check in place.
> I know that this is only to be on the safe side and we do not loss anything,
> but just my thoughts.
>
> > +static struct page *alloc_demote_page(struct page *page, unsigned long node)
> > +{
> > + /*
> > + * Try to fail quickly if memory on the target node is not
> > + * available. Leaving out __GFP_IO and __GFP_FS helps with
> > + * this. If the desintation node is full, we want kswapd to
> > + * run there so that its pages will get reclaimed and future
> > + * migration attempts may succeed.
> > + */
> > + gfp_t flags = (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_NORETRY |
> > + __GFP_NOMEMALLOC | __GFP_NOWARN | __GFP_THISNODE |
> > + __GFP_KSWAPD_RECLAIM);
>
> I think it would be nicer to have this as a real GFP_ thingy defined.
> e.g: GFP_DEMOTION
>
> > + /* HugeTLB pages should not be on the LRU */
> > + WARN_ON_ONCE(PageHuge(page));
>
> I am not sure about this one.
> This could only happen if the page, which now it is in another list, ends up in
> the buddy system. That is quite unlikely bth.
> And nevertheless, this is only a warning, which means that if this scenario gets
> to happen, we will be allocating a single page to satisfy a higher-order page, and
> I am not sure about the situation we will end up with.

IMHO, we should use BUG_ON instead of WARN_ON or we should just back
off if we see hugetlb page in this path and print out some warning.

>
> > +
> > + if (PageTransHuge(page)) {
> > + struct page *thp;
> > +
> > + flags |= __GFP_COMP;
> > +
> > + thp = alloc_pages_node(node, flags, HPAGE_PMD_ORDER);
> > + if (!thp)
> > + return NULL;
> > + prep_transhuge_page(thp);
> > + return thp;
> > + }
> > +
> > + return __alloc_pages_node(node, flags, 0);
>
> Would make sense to transform this in some sort of new_demotion_page,
> which actually calls alloc_migration_target with the right stuff in place?
> And then pass a struct migration_target_control so alloc_migration_target
> does the right thing.
> alloc_migration_target also takes care of calling prep_transhuge_page
> when needed.
> e.g:
>
> static struct page *new_demotion_node(struct page *page, unsigned long private)
> {
> struct migration_target_control mtc = {
> .nid = private,
> .gfp_mask = GFP_DEMOTION,
> };
>
> if (PageTransHuge(page))
> mtc.gfp_mask |= __GFP_COMP;
>
> return alloc_migration_target(page, (unsigned long)&mtc);
> }
>
> The only thing I see is that alloc_migration_target seems to "override"
> the gfp_mask and does ORs GFP_TRANSHUGE for THP pages, which includes
> __GFP_DIRECT_RECLAIM (not appreciated in this case).
> But maybe this can be worked around by checking if gfp_mask == GFP_DEMOTION,
> and if so, just keep the mask as it is.

Makes sense to me.

>
> > +
> > + if (list_empty(demote_pages))
> > + return 0;
> > +
> > + /* Demotion ignores all cpuset and mempolicy settings */
> > + err = migrate_pages(demote_pages, alloc_demote_page, NULL,
> > + target_nid, MIGRATE_ASYNC, MR_DEMOTION,
> > + &nr_succeeded);
>
> As I said, instead of alloc_demote_page, use a new_demote_page and make
> alloc_migration_target handle the allocations and prep thp pages.
>
>
> --
> Oscar Salvador
> SUSE L3
>