2022-07-11 09:10:51

by Huang, Ying

[permalink] [raw]
Subject: [PATCH -V2 0/7] migrate_pages(): fix several bugs in error path

From: "Huang, Ying" <[email protected]>

While reviewing the code of migrate_pages() and building a test program
for it, several bugs in the error path were identified. They are fixed
in this series.

Most patches are tested via

- Apply error-inject.patch in Linux kernel
- Compile test-migrate.c (with -lnuma)
- Test with test-migrate.sh

error-inject.patch, test-migrate.c, and test-migrate.sh are as below.
It turns out that error injection is an important tool for fixing bugs
in error paths.

Changes:

v2:

- Rebased on v5.19-rc5

- Addressed some comments from Baolin, Thanks!

- Added reviewed-by tags

Best Regards,
Huang, Ying

------------------------- error-inject.patch -------------------------
From 295ea21204f3f025a041fe39c68a2eaec8313c68 Mon Sep 17 00:00:00 2001
From: Huang Ying <[email protected]>
Date: Tue, 21 Jun 2022 11:08:30 +0800
Subject: [PATCH] migrate_pages: error inject

---
mm/migrate.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 55 insertions(+), 3 deletions(-)

diff --git a/mm/migrate.c b/mm/migrate.c
index 399904015d23..87d47064ec6c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -337,6 +337,42 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
}
#endif

+#define EI_MP_ENOSYS 0x0001
+#define EI_MP_THP_ENOMEM 0x0002
+#define EI_MP_NP_ENOMEM 0x0004
+#define EI_MP_EAGAIN 0x0008
+#define EI_MP_EOTHER 0x0010
+#define EI_MP_NOSPLIT 0x0020
+#define EI_MP_SPLIT_FAIL 0x0040
+#define EI_MP_EAGAIN_PERM 0x0080
+#define EI_MP_EBUSY 0x0100
+
+static unsigned int ei_migrate_pages;
+
+module_param(ei_migrate_pages, uint, 0644);
+
+static bool ei_thp_migration_supported(void)
+{
+ if (ei_migrate_pages & EI_MP_ENOSYS)
+ return false;
+ else
+ return thp_migration_supported();
+}
+
+static int ei_trylock_page(struct page *page)
+{
+ if (ei_migrate_pages & EI_MP_EAGAIN)
+ return 0;
+ return trylock_page(page);
+}
+
+static int ei_split_huge_page_to_list(struct page *page, struct list_head *list)
+{
+ if (ei_migrate_pages & EI_MP_SPLIT_FAIL)
+ return -EBUSY;
+ return split_huge_page_to_list(page, list);
+}
+
static int expected_page_refs(struct address_space *mapping, struct page *page)
{
int expected_count = 1;
@@ -368,6 +404,9 @@ int folio_migrate_mapping(struct address_space *mapping,
if (folio_ref_count(folio) != expected_count)
return -EAGAIN;

+ if (ei_migrate_pages & EI_MP_EAGAIN_PERM)
+ return -EAGAIN;
+
/* No turning back from here */
newfolio->index = folio->index;
newfolio->mapping = folio->mapping;
@@ -929,7 +968,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
struct anon_vma *anon_vma = NULL;
bool is_lru = !__PageMovable(page);

- if (!trylock_page(page)) {
+ if (!ei_trylock_page(page)) {
if (!force || mode == MIGRATE_ASYNC)
goto out;

@@ -952,6 +991,11 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
lock_page(page);
}

+ if (ei_migrate_pages & EI_MP_EBUSY) {
+ rc = -EBUSY;
+ goto out_unlock;
+ }
+
if (PageWriteback(page)) {
/*
* Only in the case of a full synchronous migration is it
@@ -1086,7 +1130,7 @@ static int unmap_and_move(new_page_t get_new_page,
int rc = MIGRATEPAGE_SUCCESS;
struct page *newpage = NULL;

- if (!thp_migration_supported() && PageTransHuge(page))
+ if (!ei_thp_migration_supported() && PageTransHuge(page))
return -ENOSYS;

if (page_count(page) == 1) {
@@ -1102,6 +1146,11 @@ static int unmap_and_move(new_page_t get_new_page,
goto out;
}

+ if ((ei_migrate_pages & EI_MP_THP_ENOMEM) && PageTransHuge(page))
+ return -ENOMEM;
+ if ((ei_migrate_pages & EI_MP_NP_ENOMEM) && !PageTransHuge(page))
+ return -ENOMEM;
+
newpage = get_new_page(page, private);
if (!newpage)
return -ENOMEM;
@@ -1305,7 +1354,7 @@ static inline int try_split_thp(struct page *page, struct list_head *split_pages
int rc;

lock_page(page);
- rc = split_huge_page_to_list(page, split_pages);
+ rc = ei_split_huge_page_to_list(page, split_pages);
unlock_page(page);

return rc;
@@ -1358,6 +1407,9 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
bool nosplit = (reason == MR_NUMA_MISPLACED);
bool no_subpage_counting = false;

+ if (ei_migrate_pages & EI_MP_NOSPLIT)
+ nosplit = true;
+
trace_mm_migrate_pages_start(mode, reason);

thp_subpage_migration:
--
2.30.2

------------------------- test-migrate.c -------------------------------------
#define _GNU_SOURCE

#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>

#include <fcntl.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>
#include <numaif.h>
#include <numa.h>

#ifndef MADV_FREE
#define MADV_FREE 8 /* free pages only if memory pressure */
#endif

#define ONE_MB (1024 * 1024)
#define MAP_SIZE (16 * ONE_MB)
#define THP_SIZE (2 * ONE_MB)
#define THP_MASK (THP_SIZE - 1)

/*
 * ERR_EXIT_ON(cond, msg) - abort the test when cond is non-zero.
 *
 * cond is evaluated exactly once into a local int.  If that value is
 * non-zero it is handed, together with msg, to error_exit().
 * The do { } while (0) wrapper makes the macro behave as one statement.
 */
#define ERR_EXIT_ON(cond, msg) \
do { \
int __cond_in_macro = (cond); \
if (__cond_in_macro) \
error_exit(__cond_in_macro, (msg)); \
} while (0)

/*
 * Print a one-line report for a test step to stderr, followed by the
 * per-page status values when there are any.
 *
 * ret:    return value of the call being reported
 * nr:     number of entries in status; 0 suppresses the status line
 * status: per-page status array (not read when nr is 0)
 * msg:    short description of the step
 *
 * The text for the current errno is always part of the report,
 * whatever the value of ret.
 */
void error_msg(int ret, int nr, int *status, const char *msg)
{
	const char *err_text = strerror(errno);
	int idx = 0;

	fprintf(stderr, "Error: %s, ret : %d, error: %s\n", msg, ret, err_text);

	if (nr == 0)
		return;

	fputs("status: ", stderr);
	while (idx < nr) {
		fprintf(stderr, "%d ", status[idx]);
		idx++;
	}
	fputc('\n', stderr);
}

/*
 * Report a fatal test error through error_msg() (without a status
 * array) and terminate the process with exit status 1.
 */
void error_exit(int ret, const char *msg)
{
	int *const no_status = NULL;

	error_msg(ret, 0, no_status, msg);
	exit(1);
}

void *addr_thp;
void *addr;
char *pn;
char *pn1;
char *pn2;
char *pn3;
void *pages[4];
int status[4];

/*
 * Create a fresh MAP_SIZE anonymous, private, read/write mapping and
 * record its base address in a global.
 *
 * With thp set, the range is additionally marked MADV_HUGEPAGE and the
 * base is stored in addr_thp; otherwise the base is stored in addr.
 * A failing mmap() or madvise() ends the test via ERR_EXIT_ON().
 */
void create_map(bool thp)
{
	const int prot = PROT_READ | PROT_WRITE;
	const int flags = MAP_PRIVATE | MAP_ANONYMOUS;
	void *base = mmap(NULL, MAP_SIZE, prot, flags, -1, 0);

	ERR_EXIT_ON(base == MAP_FAILED, "mmap");

	if (!thp) {
		addr = base;
		return;
	}

	ERR_EXIT_ON(madvise(base, MAP_SIZE, MADV_HUGEPAGE), "advise hugepage");
	addr_thp = base;
}

/*
 * Set up fresh mappings and the pages[]/status[] arrays for one
 * move_pages() run.
 *
 * Mappings left by a previous call (detected through a non-NULL addr)
 * are unmapped first.  Then one MADV_HUGEPAGE mapping and one plain
 * mapping are created, and pages[] receives two THP_SIZE-aligned
 * addresses from each: pages[0..1] point into the huge-page mapping,
 * pages[2..3] into the plain one.  Only pages[0] is written to here;
 * the caller chooses which of the others to fault in.
 *
 * Every status[] slot is preset to 1024, presumably so that entries the
 * kernel never filled in are easy to spot in the output.
 */
void prepare(void)
{
	if (addr) {
		munmap(addr_thp, MAP_SIZE);
		munmap(addr, MAP_SIZE);
	}

	create_map(true);
	create_map(false);

	/* First THP_SIZE boundary strictly above the start of the mapping. */
	pn = (char *)(((unsigned long)addr_thp + THP_SIZE) & ~THP_MASK);
	pn1 = pn + THP_SIZE;
	pages[0] = pn;
	pages[1] = pn1;
	*pn = 1;

	pn2 = (char *)(((unsigned long)addr + THP_SIZE) & ~THP_MASK);
	pn3 = pn2 + THP_SIZE;
	pages[2] = pn2;
	pages[3] = pn3;

	status[0] = status[1] = status[2] = status[3] = 1024;
}

/*
 * Fault in all four test pages, ask the kernel to move them to node 1,
 * then print the move_pages() return value and the per-page status.
 */
void test_migrate()
{
	enum { NR_TEST_PAGES = 4 };
	int target_nodes[NR_TEST_PAGES] = { 1, 1, 1, 1 };
	pid_t self = getpid();
	int rc;

	prepare();

	/* Touch the three remaining pages so that they are populated. */
	pn1[0] = 1;
	pn2[0] = 1;
	pn3[0] = 1;

	rc = move_pages(self, NR_TEST_PAGES, pages, target_nodes, status,
			MPOL_MF_MOVE_ALL);
	error_msg(rc, NR_TEST_PAGES, status, "move 4 pages");
}

/*
 * Entry point: bind the task to NUMA node 0 and run the migration test
 * once.
 *
 * Returns 0 after the test ran, 1 when libnuma reports that NUMA is not
 * available or the task could not be bound to node 0.
 */
int main(int argc, char *argv[])
{
	(void)argc;
	(void)argv;

	/* libnuma requires numa_available() before any other libnuma call. */
	if (numa_available() < 0) {
		fprintf(stderr, "NUMA is not available on this system\n");
		return 1;
	}

	/* Without the binding the test result would be meaningless. */
	if (numa_run_on_node(0) < 0) {
		perror("numa_run_on_node");
		return 1;
	}

	test_migrate();

	return 0;
}
--------------------- test-migrate.sh ----------------------------
#!/bin/bash

PARAM=/sys/module/migrate/parameters/ei_migrate_pages

# Print a banner built from the arguments, then every /proc/vmstat line
# whose name contains "pgmigrate" or "thp_migration".
get_vmstat()
{
	# Quoted so that the label is never glob-expanded or re-split.
	echo "================= $* ================"
	grep -e '\(pgmigrate\|thp_migration\)' /proc/vmstat
}

# Run one error-injection scenario.
#   $1   - bitmask written to the ei_migrate_pages parameter ($PARAM)
#   $2.. - free-form label shown in the vmstat banners
# The vmstat counters are printed before and after ./test-migrate runs.
simple_test()
{
	local mask=$1
	shift

	echo "$mask" > "$PARAM"
	get_vmstat before "$@"
	./test-migrate
	# Switch injection off again so that the rest of the system is not
	# left with failing page migration once the test is over.
	echo 0 > "$PARAM"
	get_vmstat after "$@"
}

#define EI_MP_ENOSYS 0x0001
#define EI_MP_THP_ENOMEM 0x0002
#define EI_MP_NP_ENOMEM 0x0004
#define EI_MP_EAGAIN 0x0008
#define EI_MP_EOTHER 0x0010
#define EI_MP_NOSPLIT 0x0020
#define EI_MP_SPLIT_FAIL 0x0040
#define EI_MP_EAGAIN_PERM 0x0080
#define EI_MP_EBUSY 0x0100

# 0x26  = EI_MP_THP_ENOMEM | EI_MP_NP_ENOMEM | EI_MP_NOSPLIT
simple_test 0x26 ENOMEM
# 0x81  = EI_MP_ENOSYS | EI_MP_EAGAIN_PERM
simple_test 0x81 retry THP subpages
# 0xc1  = EI_MP_ENOSYS | EI_MP_SPLIT_FAIL | EI_MP_EAGAIN_PERM
simple_test 0xc1 ENOSYS
# 0x101 = EI_MP_ENOSYS | EI_MP_EBUSY
simple_test 0x101 ENOSYS


2022-07-11 10:06:39

by Huang, Ying

[permalink] [raw]
Subject: [PATCH -V2 6/7] migrate_pages(): fix failure counting for THP splitting

If a THP fails to be migrated, it may be split and retried. But after
splitting, the head page is left in the "from" list, although the THP
migration failure has already been counted. If the head page fails to
be migrated too, the failure is incorrectly counted twice. Fix this by
also moving the head page of the THP to "thp_split_pages" after
splitting.

Signed-off-by: "Huang, Ying" <[email protected]>
Fixes: 5984fabb6e82 ("mm: move_pages: report the number of non-attempted pages")
Reviewed-by: Baolin Wang <[email protected]>
Cc: Zi Yan <[email protected]>
Cc: Yang Shi <[email protected]>
---
mm/migrate.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/migrate.c b/mm/migrate.c
index 8cce73b7c046..557708ce13a1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1306,6 +1306,8 @@ static inline int try_split_thp(struct page *page, struct list_head *split_pages
lock_page(page);
rc = split_huge_page_to_list(page, split_pages);
unlock_page(page);
+ if (!rc)
+ list_move_tail(&page->lru, split_pages);

return rc;
}
@@ -1365,7 +1367,6 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
thp_retry = 0;

list_for_each_entry_safe(page, page2, from, lru) {
-retry:
/*
* THP statistics is based on the source huge page.
* Capture required information that might get lost
@@ -1412,7 +1413,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
nr_thp_failed++;
if (!try_split_thp(page, &thp_split_pages)) {
nr_thp_split++;
- goto retry;
+ break;
}
/* Hugetlb migration is unsupported */
} else if (!no_subpage_counting) {
@@ -1432,7 +1433,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
/* THP NUMA faulting doesn't split THP to retry. */
if (!nosplit && !try_split_thp(page, &thp_split_pages)) {
nr_thp_split++;
- goto retry;
+ break;
}
} else if (!no_subpage_counting) {
nr_failed++;
--
2.30.2

2022-07-11 10:20:16

by Huang, Ying

[permalink] [raw]
Subject: [PATCH -V2 1/7] migrate: fix syscall move_pages() return value for failure

The return value of the move_pages() syscall is incorrect because the
remaining pages to be migrated are miscounted. For example, for the
following test program,

"
#define _GNU_SOURCE

#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>

#include <fcntl.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>
#include <numaif.h>
#include <numa.h>

#ifndef MADV_FREE
#define MADV_FREE 8 /* free pages only if memory pressure */
#endif

#define ONE_MB (1024 * 1024)
#define MAP_SIZE (16 * ONE_MB)
#define THP_SIZE (2 * ONE_MB)
#define THP_MASK (THP_SIZE - 1)

/*
 * ERR_EXIT_ON(cond, msg) - abort the test when cond is non-zero.
 *
 * cond is evaluated exactly once into a local int.  If that value is
 * non-zero it is handed, together with msg, to error_exit().
 * The do { } while (0) wrapper makes the macro behave as one statement.
 */
#define ERR_EXIT_ON(cond, msg) \
do { \
int __cond_in_macro = (cond); \
if (__cond_in_macro) \
error_exit(__cond_in_macro, (msg)); \
} while (0)

/*
 * Print a one-line report for a test step to stderr, followed by the
 * per-page status values when there are any.
 *
 * ret:    return value of the call being reported
 * nr:     number of entries in status; 0 suppresses the status line
 * status: per-page status array (not read when nr is 0)
 * msg:    short description of the step
 *
 * The text for the current errno is always part of the report,
 * whatever the value of ret.
 */
void error_msg(int ret, int nr, int *status, const char *msg)
{
	const char *err_text = strerror(errno);
	int idx = 0;

	fprintf(stderr, "Error: %s, ret : %d, error: %s\n", msg, ret, err_text);

	if (nr == 0)
		return;

	fputs("status: ", stderr);
	while (idx < nr) {
		fprintf(stderr, "%d ", status[idx]);
		idx++;
	}
	fputc('\n', stderr);
}

/*
 * Report a fatal test error through error_msg() (without a status
 * array) and terminate the process with exit status 1.
 */
void error_exit(int ret, const char *msg)
{
	int *const no_status = NULL;

	error_msg(ret, 0, no_status, msg);
	exit(1);
}

int page_size;

bool do_vmsplice;
bool do_thp;

static int pipe_fds[2];
void *addr;
char *pn;
char *pn1;
void *pages[2];
int status[2];

/*
 * Build a fresh pipe, mapping and pages[]/status[] setup for one
 * move_pages() run.
 *
 * Resources from a previous call (detected through a non-NULL addr) are
 * released first.  The new MAP_SIZE mapping is marked MADV_HUGEPAGE when
 * do_thp is set.  pages[0] and pages[1] become two consecutive
 * THP_SIZE-aligned addresses inside it; only pages[0] is written to
 * here.  When do_vmsplice is set, the first page_size bytes at pages[0]
 * are vmsplice()d into the pipe - main() labels those runs "Make page 0
 * cannot be migrated", so presumably the pipe's hold on the page is what
 * blocks its migration.
 *
 * Both status[] slots are preset to 1024.  Any failing call ends the
 * test via ERR_EXIT_ON().
 */
void prepare()
{
	unsigned long aligned;

	/* Drop whatever a previous call left behind. */
	if (addr != NULL) {
		munmap(addr, MAP_SIZE);
		close(pipe_fds[0]);
		close(pipe_fds[1]);
	}

	ERR_EXIT_ON(pipe(pipe_fds), "pipe");

	addr = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	ERR_EXIT_ON(addr == MAP_FAILED, "mmap");

	if (do_thp) {
		ERR_EXIT_ON(madvise(addr, MAP_SIZE, MADV_HUGEPAGE),
			    "advise hugepage");
	}

	/* First THP_SIZE boundary strictly above the start of the mapping. */
	aligned = ((unsigned long)addr + THP_SIZE) & ~THP_MASK;
	pn = (char *)aligned;
	pn1 = pn + THP_SIZE;
	pages[0] = pn;
	pages[1] = pn1;
	pn[0] = 1;

	if (do_vmsplice) {
		struct iovec first_page = {
			.iov_base = pn,
			.iov_len = page_size,
		};
		int spliced = vmsplice(pipe_fds[1], &first_page, 1, 0);

		ERR_EXIT_ON(spliced < 0, "vmsplice");
	}

	status[0] = 1024;
	status[1] = 1024;
}

/*
 * Run the four move_pages() scenarios in order, each on a freshly
 * prepared mapping, and print the return value and status array of
 * every call.  pages[0] always targets node 1.
 */
void test_migrate()
{
	static const struct {
		int nr;			/* pages handed to move_pages() */
		bool touch_page1;	/* write to pages[1] before the call */
		int node1;		/* target node for pages[1] */
		const char *what;	/* label printed with the result */
	} steps[] = {
		{ 1, false, 1, "move 1 page" },
		{ 2, false, 1, "move 2 pages, page 1 not mapped" },
		{ 2, true,  1, "move 2 pages" },
		{ 2, true,  0, "move 2 pages, page 1 to node 0" },
	};
	const pid_t pid = getpid();
	unsigned int step;

	for (step = 0; step < sizeof(steps) / sizeof(steps[0]); step++) {
		int nodes[2] = { 1, steps[step].node1 };
		int ret;

		prepare();
		if (steps[step].touch_page1)
			*pn1 = 1;

		ret = move_pages(pid, steps[step].nr, pages, nodes, status,
				 MPOL_MF_MOVE_ALL);
		error_msg(ret, steps[step].nr, status, steps[step].what);
	}
}

/*
 * Entry point: bind the task to NUMA node 0, record the page size, and
 * run test_migrate() four times, covering every combination of
 * do_thp and do_vmsplice (plain, vmsplice, THP, THP + vmsplice).
 *
 * Returns 0 after all runs, 1 when libnuma reports that NUMA is not
 * available or the task could not be bound to node 0.
 */
int main(int argc, char *argv[])
{
	(void)argc;
	(void)argv;

	/* libnuma requires numa_available() before any other libnuma call. */
	if (numa_available() < 0) {
		fprintf(stderr, "NUMA is not available on this system\n");
		return 1;
	}

	/* Without the binding the test result would be meaningless. */
	if (numa_run_on_node(0) < 0) {
		perror("numa_run_on_node");
		return 1;
	}

	page_size = getpagesize();

	test_migrate();

	fprintf(stderr, "\nMake page 0 cannot be migrated:\n");
	do_vmsplice = true;
	test_migrate();

	fprintf(stderr, "\nTest THP:\n");
	do_thp = true;
	do_vmsplice = false;
	test_migrate();

	fprintf(stderr, "\nTHP: make page 0 cannot be migrated:\n");
	do_vmsplice = true;
	test_migrate();

	return 0;
}
"

The output of the current kernel is,

"
Error: move 1 page, ret : 0, error: Success
status: 1
Error: move 2 pages, page 1 not mapped, ret : 0, error: Success
status: 1 -14
Error: move 2 pages, ret : 0, error: Success
status: 1 1
Error: move 2 pages, page 1 to node 0, ret : 0, error: Success
status: 1 0

Make page 0 cannot be migrated:
Error: move 1 page, ret : 0, error: Success
status: 1024
Error: move 2 pages, page 1 not mapped, ret : 1, error: Success
status: 1024 -14
Error: move 2 pages, ret : 0, error: Success
status: 1024 1024
Error: move 2 pages, page 1 to node 0, ret : 1, error: Success
status: 1024 1024
"

While the expected output is,

"
Error: move 1 page, ret : 0, error: Success
status: 1
Error: move 2 pages, page 1 not mapped, ret : 0, error: Success
status: 1 -14
Error: move 2 pages, ret : 0, error: Success
status: 1 1
Error: move 2 pages, page 1 to node 0, ret : 0, error: Success
status: 1 0

Make page 0 cannot be migrated:
Error: move 1 page, ret : 1, error: Success
status: 1024
Error: move 2 pages, page 1 not mapped, ret : 1, error: Success
status: 1024 -14
Error: move 2 pages, ret : 1, error: Success
status: 1024 1024
Error: move 2 pages, page 1 to node 0, ret : 2, error: Success
status: 1024 1024
"

Fix this by correcting the count of remaining pages. With the fix,
the output of the test program above matches the expected output.

Signed-off-by: "Huang, Ying" <[email protected]>
Fixes: 5984fabb6e82 ("mm: move_pages: report the number of non-attempted pages")
Cc: Baolin Wang <[email protected]>
Cc: Zi Yan <[email protected]>
Cc: Yang Shi <[email protected]>
---
mm/migrate.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/mm/migrate.c b/mm/migrate.c
index 6c1ea61f39d8..472335f0aaa3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1682,7 +1682,7 @@ static int move_pages_and_store_status(struct mm_struct *mm, int node,
* well.
*/
if (err > 0)
- err += nr_pages - i - 1;
+ err += nr_pages - i;
return err;
}
return store_status(status, start, node, i - start);
@@ -1768,8 +1768,12 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,

err = move_pages_and_store_status(mm, current_node, &pagelist,
status, start, i, nr_pages);
- if (err)
+ if (err) {
+ /* We have stored status of page i */
+ if (err > 0)
+ err--;
goto out;
+ }
current_node = NUMA_NO_NODE;
}
out_flush:
--
2.30.2

2022-07-11 12:41:04

by Oscar Salvador

[permalink] [raw]
Subject: Re: [PATCH -V2 6/7] migrate_pages(): fix failure counting for THP splitting

On Mon, Jul 11, 2022 at 04:49:47PM +0800, Huang Ying wrote:
> If THP is failed to be migrated, it may be split and retry. But after
> splitting, the head page will be left in "from" list, although THP
> migration failure has been counted already. If the head page is
> failed to be migrated too, the failure will be counted twice
> incorrectly. So this is fixed in this patch via moving the head page
> of THP after splitting to "thp_split_pages" too.
>
> Signed-off-by: "Huang, Ying" <[email protected]>
> Fixes: 5984fabb6e82 ("mm: move_pages: report the number of non-attempted pages")
> Reviewed-by: Baolin Wang <[email protected]>
> Cc: Zi Yan <[email protected]>
> Cc: Yang Shi <[email protected]>

Reviewed-by: Oscar Salvador <[email protected]>

> ---
> mm/migrate.c | 7 ++++---
> 1 file changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 8cce73b7c046..557708ce13a1 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -1306,6 +1306,8 @@ static inline int try_split_thp(struct page *page, struct list_head *split_pages
> lock_page(page);
> rc = split_huge_page_to_list(page, split_pages);
> unlock_page(page);
> + if (!rc)
> + list_move_tail(&page->lru, split_pages);
>
> return rc;
> }
> @@ -1365,7 +1367,6 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
> thp_retry = 0;
>
> list_for_each_entry_safe(page, page2, from, lru) {
> -retry:
> /*
> * THP statistics is based on the source huge page.
> * Capture required information that might get lost
> @@ -1412,7 +1413,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
> nr_thp_failed++;
> if (!try_split_thp(page, &thp_split_pages)) {
> nr_thp_split++;
> - goto retry;
> + break;
> }
> /* Hugetlb migration is unsupported */
> } else if (!no_subpage_counting) {
> @@ -1432,7 +1433,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
> /* THP NUMA faulting doesn't split THP to retry. */
> if (!nosplit && !try_split_thp(page, &thp_split_pages)) {
> nr_thp_split++;
> - goto retry;
> + break;
> }
> } else if (!no_subpage_counting) {
> nr_failed++;
> --
> 2.30.2
>
>

--
Oscar Salvador
SUSE Labs

2022-07-11 12:52:14

by Oscar Salvador

[permalink] [raw]
Subject: Re: [PATCH -V2 1/7] migrate: fix syscall move_pages() return value for failure

On Mon, Jul 11, 2022 at 04:49:42PM +0800, Huang Ying wrote:
> return store_status(status, start, node, i - start);
> @@ -1768,8 +1768,12 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
>
> err = move_pages_and_store_status(mm, current_node, &pagelist,
> status, start, i, nr_pages);
> - if (err)
> + if (err) {
> + /* We have stored status of page i */

IMHO, "We have accounted for page i" gives a more clear statement.

Reviewed-by: Oscar Salvador <[email protected]>

> + if (err > 0)
> + err--;
> goto out;
> + }
> current_node = NUMA_NO_NODE;
> }
> out_flush:
> --
> 2.30.2
>
>

--
Oscar Salvador
SUSE Labs

2022-07-12 01:56:39

by Huang, Ying

[permalink] [raw]
Subject: Re: [PATCH -V2 1/7] migrate: fix syscall move_pages() return value for failure

Oscar Salvador <[email protected]> writes:

> On Mon, Jul 11, 2022 at 04:49:42PM +0800, Huang Ying wrote:
>> return store_status(status, start, node, i - start);
>> @@ -1768,8 +1768,12 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
>>
>> err = move_pages_and_store_status(mm, current_node, &pagelist,
>> status, start, i, nr_pages);
>> - if (err)
>> + if (err) {
>> + /* We have stored status of page i */
>
> IMHO, "We have accounted for page i" gives a more clear statement.

Sure. Will change this.

> Reviewed-by: Oscar Salvador <[email protected]>

Thanks!

Best Regards,
Huang, Ying

>> + if (err > 0)
>> + err--;
>> goto out;
>> + }
>> current_node = NUMA_NO_NODE;
>> }
>> out_flush:
>> --
>> 2.30.2
>>
>>