Today when hardware memory is corrupted in a hugetlb hugepage, the
kernel leaves the hugepage in the pagecache [1]; otherwise a future mmap or
read will be subject to silent data corruption. This is implemented by
returning -EIO from hugetlb_read_iter immediately if the hugepage has
HWPOISON flag set.
Since memory_failure already tracks the raw HWPOISON subpages in a
hugepage, a natural improvement is possible: if userspace only asks for
healthy subpages in the pagecache, the kernel can return this data.
This patchset implements this improvement. The 1st commit fixes an issue
in __folio_free_raw_hwp. The 2nd commit exports the functionality to
tell if a subpage inside a hugetlb hugepage is a raw HWPOISON page.
The 3rd commit teaches hugetlbfs_read_iter to return as many healthy
bytes as possible. The last commit properly tests this new feature.
[1] commit 8625147cafaa ("hugetlbfs: don't delete error page from pagecache")
Changelog
v1 => v2
* __folio_free_raw_hwp deletes all entries in raw_hwp_list before it
traverses and frees raw_hwp_page.
* find_raw_hwp_page => __is_raw_hwp_subpage and __is_raw_hwp_subpage
only returns bool instead of a raw_hwp_page entry.
* is_raw_hwp_subpage holds hugetlb_lock while checking
__is_raw_hwp_subpage.
* No need to do folio_lock in adjust_range_hwpoison.
* v2 is based on commit a6e79df92e4a ("mm/gup: disallow FOLL_LONGTERM
GUP-fast writing to file-backed mappings")
Jiaqi Yan (4):
mm/hwpoison: delete all entries before traversal in
__folio_free_raw_hwp
mm/hwpoison: check if a subpage of a hugetlb folio is raw HWPOISON
hugetlbfs: improve read HWPOISON hugepage
selftests/mm: add tests for HWPOISON hugetlbfs read
fs/hugetlbfs/inode.c | 58 +++-
include/linux/hugetlb.h | 19 ++
include/linux/mm.h | 7 +
mm/hugetlb.c | 10 +
mm/memory-failure.c | 42 ++-
tools/testing/selftests/mm/.gitignore | 1 +
tools/testing/selftests/mm/Makefile | 1 +
.../selftests/mm/hugetlb-read-hwpoison.c | 322 ++++++++++++++++++
8 files changed, 439 insertions(+), 21 deletions(-)
create mode 100644 tools/testing/selftests/mm/hugetlb-read-hwpoison.c
--
2.41.0.162.gfafddb0af9-goog
Adds the functionality to tell if a subpage of a hugetlb folio is a
raw HWPOISON page. This functionality relies on RawHwpUnreliable not
being set; otherwise the hugepage's HWPOISON list becomes meaningless.
Exports this functionality to be immediately used in the read operation
for hugetlbfs.
Signed-off-by: Jiaqi Yan <[email protected]>
---
include/linux/hugetlb.h | 19 +++++++++++++++++++
include/linux/mm.h | 7 +++++++
mm/hugetlb.c | 10 ++++++++++
mm/memory-failure.c | 34 ++++++++++++++++++++++++----------
4 files changed, 60 insertions(+), 10 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 21f942025fec..8b73a12b7b38 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1013,6 +1013,25 @@ void hugetlb_register_node(struct node *node);
void hugetlb_unregister_node(struct node *node);
#endif
+/*
+ * Struct raw_hwp_page represents information about "raw error page",
+ * constructing singly linked list from ->_hugetlb_hwpoison field of folio.
+ */
+struct raw_hwp_page {
+ struct llist_node node;
+ struct page *page;
+};
+
+static inline struct llist_head *raw_hwp_list_head(struct folio *folio)
+{
+ return (struct llist_head *)&folio->_hugetlb_hwpoison;
+}
+
+/*
+ * Check if a given raw @subpage in a hugepage @folio is HWPOISON.
+ */
+bool is_raw_hwp_subpage(struct folio *folio, struct page *subpage);
+
#else /* CONFIG_HUGETLB_PAGE */
struct hstate {};
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 66032f0d515c..41a283bd41a7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3671,6 +3671,7 @@ extern const struct attribute_group memory_failure_attr_group;
extern void memory_failure_queue(unsigned long pfn, int flags);
extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
bool *migratable_cleared);
+extern bool __is_raw_hwp_subpage(struct folio *folio, struct page *subpage);
void num_poisoned_pages_inc(unsigned long pfn);
void num_poisoned_pages_sub(unsigned long pfn, long i);
struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
@@ -3685,6 +3686,12 @@ static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
return 0;
}
+static inline bool __is_raw_hwp_subpage(struct folio *folio,
+ struct page *subpage)
+{
+ return false;
+}
+
static inline void num_poisoned_pages_inc(unsigned long pfn)
{
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ea24718db4af..6b860de87590 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7377,6 +7377,16 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
return ret;
}
+bool is_raw_hwp_subpage(struct folio *folio, struct page *subpage)
+{
+ bool ret;
+
+ spin_lock_irq(&hugetlb_lock);
+ ret = __is_raw_hwp_subpage(folio, subpage);
+ spin_unlock_irq(&hugetlb_lock);
+ return ret;
+}
+
void folio_putback_active_hugetlb(struct folio *folio)
{
spin_lock_irq(&hugetlb_lock);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c415c3c462a3..891248e2930e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1809,18 +1809,32 @@ EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
#endif /* CONFIG_FS_DAX */
#ifdef CONFIG_HUGETLB_PAGE
-/*
- * Struct raw_hwp_page represents information about "raw error page",
- * constructing singly linked list from ->_hugetlb_hwpoison field of folio.
- */
-struct raw_hwp_page {
- struct llist_node node;
- struct page *page;
-};
-static inline struct llist_head *raw_hwp_list_head(struct folio *folio)
+bool __is_raw_hwp_subpage(struct folio *folio, struct page *subpage)
{
- return (struct llist_head *)&folio->_hugetlb_hwpoison;
+ struct llist_head *raw_hwp_head;
+ struct raw_hwp_page *p, *tmp;
+ bool ret = false;
+
+ if (!folio_test_hwpoison(folio))
+ return false;
+
+ /*
+ * When RawHwpUnreliable is set, kernel lost track of which subpages
+ * are HWPOISON. So return as if ALL subpages are HWPOISONed.
+ */
+ if (folio_test_hugetlb_raw_hwp_unreliable(folio))
+ return true;
+
+ raw_hwp_head = raw_hwp_list_head(folio);
+ llist_for_each_entry_safe(p, tmp, raw_hwp_head->first, node) {
+ if (subpage == p->page) {
+ ret = true;
+ break;
+ }
+ }
+
+ return ret;
}
static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
--
2.41.0.162.gfafddb0af9-goog
Add tests for the improvement made to read operation on HWPOISON
hugetlb page with different read granularities. For each chunk size,
three read scenarios are tested:
1. Simple regression test on read without HWPOISON.
2. Sequential read page by page should succeed until it encounters the 1st
raw HWPOISON subpage.
3. After skipping the raw HWPOISON subpage via lseek, read()s always succeed.
Signed-off-by: Jiaqi Yan <[email protected]>
---
tools/testing/selftests/mm/.gitignore | 1 +
tools/testing/selftests/mm/Makefile | 1 +
.../selftests/mm/hugetlb-read-hwpoison.c | 322 ++++++++++++++++++
3 files changed, 324 insertions(+)
create mode 100644 tools/testing/selftests/mm/hugetlb-read-hwpoison.c
diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index 5599cf287694..37419296bf79 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -5,6 +5,7 @@ hugepage-mremap
hugepage-shm
hugepage-vmemmap
hugetlb-madvise
+hugetlb-read-hwpoison
khugepaged
map_hugetlb
map_populate
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 95acb099315e..63fcc7e3e9f0 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -38,6 +38,7 @@ TEST_GEN_PROGS += gup_longterm
TEST_GEN_PROGS += gup_test
TEST_GEN_PROGS += hmm-tests
TEST_GEN_PROGS += hugetlb-madvise
+TEST_GEN_PROGS += hugetlb-read-hwpoison
TEST_GEN_PROGS += hugepage-mmap
TEST_GEN_PROGS += hugepage-mremap
TEST_GEN_PROGS += hugepage-shm
diff --git a/tools/testing/selftests/mm/hugetlb-read-hwpoison.c b/tools/testing/selftests/mm/hugetlb-read-hwpoison.c
new file mode 100644
index 000000000000..ba6cc6f9cabc
--- /dev/null
+++ b/tools/testing/selftests/mm/hugetlb-read-hwpoison.c
@@ -0,0 +1,322 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <linux/magic.h>
+#include <sys/mman.h>
+#include <sys/statfs.h>
+#include <errno.h>
+#include <stdbool.h>
+
+#include "../kselftest.h"
+
+#define PREFIX " ... "
+#define ERROR_PREFIX " !!! "
+
+#define MAX_WRITE_READ_CHUNK_SIZE (getpagesize() * 16)
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+
+enum test_status {
+ TEST_PASSED = 0,
+ TEST_FAILED = 1,
+ TEST_SKIPPED = 2,
+};
+
+static char *status_to_str(enum test_status status)
+{
+ switch (status) {
+ case TEST_PASSED:
+ return "TEST_PASSED";
+ case TEST_FAILED:
+ return "TEST_FAILED";
+ case TEST_SKIPPED:
+ return "TEST_SKIPPED";
+ default:
+ return "TEST_???";
+ }
+}
+
+static int setup_filemap(char *filemap, size_t len, size_t wr_chunk_size)
+{
+ char iter = 0;
+
+ for (size_t offset = 0; offset < len;
+ offset += wr_chunk_size) {
+ iter++;
+ memset(filemap + offset, iter, wr_chunk_size);
+ }
+
+ return 0;
+}
+
+static bool verify_chunk(char *buf, size_t len, char val)
+{
+ size_t i;
+
+ for (i = 0; i < len; ++i) {
+ if (buf[i] != val) {
+ printf(PREFIX ERROR_PREFIX "check fail: buf[%lu] = %u != %u\n",
+ i, buf[i], val);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool seek_read_hugepage_filemap(int fd, size_t len, size_t wr_chunk_size,
+ off_t offset, size_t expected)
+{
+ char buf[MAX_WRITE_READ_CHUNK_SIZE];
+ ssize_t ret_count = 0;
+ ssize_t total_ret_count = 0;
+ char val = offset / wr_chunk_size + offset % wr_chunk_size;
+
+ printf(PREFIX PREFIX "init val=%u with offset=0x%lx\n", val, offset);
+ printf(PREFIX PREFIX "expect to read 0x%lx bytes of data in total\n",
+ expected);
+ if (lseek(fd, offset, SEEK_SET) < 0) {
+ perror(PREFIX ERROR_PREFIX "seek failed");
+ return false;
+ }
+
+ while (offset + total_ret_count < len) {
+ ret_count = read(fd, buf, wr_chunk_size);
+ if (ret_count == 0) {
+ printf(PREFIX PREFIX "read reach end of the file\n");
+ break;
+ } else if (ret_count < 0) {
+ perror(PREFIX ERROR_PREFIX "read failed");
+ break;
+ }
+ ++val;
+ if (!verify_chunk(buf, ret_count, val))
+ return false;
+
+ total_ret_count += ret_count;
+ }
+ printf(PREFIX PREFIX "actually read 0x%lx bytes of data in total\n",
+ total_ret_count);
+
+ return total_ret_count == expected;
+}
+
+static bool read_hugepage_filemap(int fd, size_t len,
+ size_t wr_chunk_size, size_t expected)
+{
+ char buf[MAX_WRITE_READ_CHUNK_SIZE];
+ ssize_t ret_count = 0;
+ ssize_t total_ret_count = 0;
+ char val = 0;
+
+ printf(PREFIX PREFIX "expect to read 0x%lx bytes of data in total\n",
+ expected);
+ while (total_ret_count < len) {
+ ret_count = read(fd, buf, wr_chunk_size);
+ if (ret_count == 0) {
+ printf(PREFIX PREFIX "read reach end of the file\n");
+ break;
+ } else if (ret_count < 0) {
+ perror(PREFIX ERROR_PREFIX "read failed");
+ break;
+ }
+ ++val;
+ if (!verify_chunk(buf, ret_count, val))
+ return false;
+
+ total_ret_count += ret_count;
+ }
+ printf(PREFIX PREFIX "actually read 0x%lx bytes of data in total\n",
+ total_ret_count);
+
+ return total_ret_count == expected;
+}
+
+static enum test_status
+test_hugetlb_read(int fd, size_t len, size_t wr_chunk_size)
+{
+ enum test_status status = TEST_SKIPPED;
+ char *filemap = NULL;
+
+ if (ftruncate(fd, len) < 0) {
+ perror(PREFIX ERROR_PREFIX "ftruncate failed");
+ return status;
+ }
+
+ filemap = mmap(NULL, len, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, fd, 0);
+ if (filemap == MAP_FAILED) {
+ perror(PREFIX ERROR_PREFIX "mmap for primary mapping failed");
+ goto done;
+ }
+
+ setup_filemap(filemap, len, wr_chunk_size);
+ status = TEST_FAILED;
+
+ if (read_hugepage_filemap(fd, len, wr_chunk_size, len))
+ status = TEST_PASSED;
+
+ munmap(filemap, len);
+done:
+ if (ftruncate(fd, 0) < 0) {
+ perror(PREFIX ERROR_PREFIX "ftruncate back to 0 failed");
+ status = TEST_FAILED;
+ }
+
+ return status;
+}
+
+static enum test_status
+test_hugetlb_read_hwpoison(int fd, size_t len, size_t wr_chunk_size,
+ bool skip_hwpoison_page)
+{
+ enum test_status status = TEST_SKIPPED;
+ char *filemap = NULL;
+ char *hwp_addr = NULL;
+ const unsigned long pagesize = getpagesize();
+
+ if (ftruncate(fd, len) < 0) {
+ perror(PREFIX ERROR_PREFIX "ftruncate failed");
+ return status;
+ }
+
+ filemap = mmap(NULL, len, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, fd, 0);
+ if (filemap == MAP_FAILED) {
+ perror(PREFIX ERROR_PREFIX "mmap for primary mapping failed");
+ goto done;
+ }
+
+ setup_filemap(filemap, len, wr_chunk_size);
+ status = TEST_FAILED;
+
+ /*
+ * Poisoned hugetlb page layout (assume hugepagesize=2MB):
+ * |<---------------------- 1MB ---------------------->|
+ * |<---- healthy page ---->|<---- HWPOISON page ----->|
+ * |<------------------- (1MB - 8KB) ----------------->|
+ */
+ hwp_addr = filemap + len / 2 + pagesize;
+ if (madvise(hwp_addr, pagesize, MADV_HWPOISON) < 0) {
+ perror(PREFIX ERROR_PREFIX "MADV_HWPOISON failed");
+ goto unmap;
+ }
+
+ if (!skip_hwpoison_page) {
+ /*
+ * Userspace should be able to read (1MB + 1 page) from
+ * the beginning of the HWPOISONed hugepage.
+ */
+ if (read_hugepage_filemap(fd, len, wr_chunk_size,
+ len / 2 + pagesize))
+ status = TEST_PASSED;
+ } else {
+ /*
+ * Userspace should be able to read (1MB - 2 pages) from
+ * HWPOISONed hugepage.
+ */
+ if (seek_read_hugepage_filemap(fd, len, wr_chunk_size,
+ len / 2 + MAX(2 * pagesize, wr_chunk_size),
+ len / 2 - MAX(2 * pagesize, wr_chunk_size)))
+ status = TEST_PASSED;
+ }
+
+unmap:
+ munmap(filemap, len);
+done:
+ if (ftruncate(fd, 0) < 0) {
+ perror(PREFIX ERROR_PREFIX "ftruncate back to 0 failed");
+ status = TEST_FAILED;
+ }
+
+ return status;
+}
+
+static int create_hugetlbfs_file(struct statfs *file_stat)
+{
+ int fd;
+
+ fd = memfd_create("hugetlb_tmp", MFD_HUGETLB);
+ if (fd < 0) {
+ perror(PREFIX ERROR_PREFIX "could not open hugetlbfs file");
+ return -1;
+ }
+
+ memset(file_stat, 0, sizeof(*file_stat));
+ if (fstatfs(fd, file_stat)) {
+ perror(PREFIX ERROR_PREFIX "fstatfs failed");
+ goto close;
+ }
+ if (file_stat->f_type != HUGETLBFS_MAGIC) {
+ printf(PREFIX ERROR_PREFIX "not hugetlbfs file\n");
+ goto close;
+ }
+
+ return fd;
+close:
+ close(fd);
+ return -1;
+}
+
+int main(void)
+{
+ int fd;
+ struct statfs file_stat;
+ enum test_status status;
+ /* Test read() in different granularity. */
+ size_t wr_chunk_sizes[] = {
+ getpagesize() / 2, getpagesize(),
+ getpagesize() * 2, getpagesize() * 4
+ };
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(wr_chunk_sizes); ++i) {
+ printf("Write/read chunk size=0x%lx\n",
+ wr_chunk_sizes[i]);
+
+ fd = create_hugetlbfs_file(&file_stat);
+ if (fd < 0)
+ goto create_failure;
+ printf(PREFIX "HugeTLB read regression test...\n");
+ status = test_hugetlb_read(fd, file_stat.f_bsize,
+ wr_chunk_sizes[i]);
+ printf(PREFIX "HugeTLB read regression test...%s\n",
+ status_to_str(status));
+ close(fd);
+ if (status == TEST_FAILED)
+ return -1;
+
+ fd = create_hugetlbfs_file(&file_stat);
+ if (fd < 0)
+ goto create_failure;
+ printf(PREFIX "HugeTLB read HWPOISON test...\n");
+ status = test_hugetlb_read_hwpoison(fd, file_stat.f_bsize,
+ wr_chunk_sizes[i], false);
+ printf(PREFIX "HugeTLB read HWPOISON test...%s\n",
+ status_to_str(status));
+ close(fd);
+ if (status == TEST_FAILED)
+ return -1;
+
+ fd = create_hugetlbfs_file(&file_stat);
+ if (fd < 0)
+ goto create_failure;
+ printf(PREFIX "HugeTLB seek then read HWPOISON test...\n");
+ status = test_hugetlb_read_hwpoison(fd, file_stat.f_bsize,
+ wr_chunk_sizes[i], true);
+ printf(PREFIX "HugeTLB seek then read HWPOISON test...%s\n",
+ status_to_str(status));
+ close(fd);
+ if (status == TEST_FAILED)
+ return -1;
+ }
+
+ return 0;
+
+create_failure:
+ printf(ERROR_PREFIX "Abort test: failed to create hugetlbfs file\n");
+ return -1;
+}
--
2.41.0.162.gfafddb0af9-goog
When a hugepage contains HWPOISON pages, read() fails to read any byte
of the hugepage and returns -EIO, although many bytes in the HWPOISON
hugepage are readable.
Improve this by allowing hugetlbfs_read_iter to return as many bytes as
possible. For a requested range [offset, offset + len) that contains a
HWPOISON page, return [offset, first HWPOISON page addr); the next read
attempt will fail and return -EIO.
Signed-off-by: Jiaqi Yan <[email protected]>
---
fs/hugetlbfs/inode.c | 58 +++++++++++++++++++++++++++++++++++++++-----
1 file changed, 52 insertions(+), 6 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 90361a922cec..86879ca3ff1e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -282,6 +282,42 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
}
#endif
+/*
+ * Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset.
+ * Returns the maximum number of bytes one can read without touching the 1st raw
+ * HWPOISON subpage.
+ *
+ * The implementation borrows the iteration logic from copy_page_to_iter*.
+ */
+static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t bytes)
+{
+ size_t n = 0;
+ size_t res = 0;
+ struct folio *folio = page_folio(page);
+
+ /* First subpage to start the loop. */
+ page += offset / PAGE_SIZE;
+ offset %= PAGE_SIZE;
+ while (1) {
+ if (is_raw_hwp_subpage(folio, page))
+ break;
+
+ /* Safe to read n bytes without touching HWPOISON subpage. */
+ n = min(bytes, (size_t)PAGE_SIZE - offset);
+ res += n;
+ bytes -= n;
+ if (!bytes || !n)
+ break;
+ offset += n;
+ if (offset == PAGE_SIZE) {
+ page++;
+ offset = 0;
+ }
+ }
+
+ return res;
+}
+
/*
* Support for read() - Find the page attached to f_mapping and copy out the
* data. This provides functionality similar to filemap_read().
@@ -300,7 +336,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
while (iov_iter_count(to)) {
struct page *page;
- size_t nr, copied;
+ size_t nr, copied, want;
/* nr is the maximum number of bytes to copy from this page */
nr = huge_page_size(h);
@@ -328,16 +364,26 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
} else {
unlock_page(page);
- if (PageHWPoison(page)) {
- put_page(page);
- retval = -EIO;
- break;
+ if (!PageHWPoison(page))
+ want = nr;
+ else {
+ /*
+ * Adjust how many bytes safe to read without
+ * touching the 1st raw HWPOISON subpage after
+ * offset.
+ */
+ want = adjust_range_hwpoison(page, offset, nr);
+ if (want == 0) {
+ put_page(page);
+ retval = -EIO;
+ break;
+ }
}
/*
* We have the page, copy it to user space buffer.
*/
- copied = copy_page_to_iter(page, offset, nr, to);
+ copied = copy_page_to_iter(page, offset, want, to);
put_page(page);
}
offset += copied;
--
2.41.0.162.gfafddb0af9-goog
On 06/23/23 16:40, Jiaqi Yan wrote:
> Adds the functionality to tell if a subpage of a hugetlb folio is a
> raw HWPOISON page. This functionality relies on RawHwpUnreliable to
> be not set; otherwise hugepage's HWPOISON list becomes meaningless.
>
> Exports this functionality to be immediately used in the read operation
> for hugetlbfs.
>
> Signed-off-by: Jiaqi Yan <[email protected]>
> ---
> include/linux/hugetlb.h | 19 +++++++++++++++++++
> include/linux/mm.h | 7 +++++++
> mm/hugetlb.c | 10 ++++++++++
> mm/memory-failure.c | 34 ++++++++++++++++++++++++----------
> 4 files changed, 60 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 21f942025fec..8b73a12b7b38 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -1013,6 +1013,25 @@ void hugetlb_register_node(struct node *node);
> void hugetlb_unregister_node(struct node *node);
> #endif
>
> +/*
> + * Struct raw_hwp_page represents information about "raw error page",
> + * constructing singly linked list from ->_hugetlb_hwpoison field of folio.
> + */
> +struct raw_hwp_page {
> + struct llist_node node;
> + struct page *page;
> +};
> +
> +static inline struct llist_head *raw_hwp_list_head(struct folio *folio)
> +{
> + return (struct llist_head *)&folio->_hugetlb_hwpoison;
> +}
> +
> +/*
> + * Check if a given raw @subpage in a hugepage @folio is HWPOISON.
> + */
> +bool is_raw_hwp_subpage(struct folio *folio, struct page *subpage);
> +
> #else /* CONFIG_HUGETLB_PAGE */
> struct hstate {};
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 66032f0d515c..41a283bd41a7 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3671,6 +3671,7 @@ extern const struct attribute_group memory_failure_attr_group;
> extern void memory_failure_queue(unsigned long pfn, int flags);
> extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> bool *migratable_cleared);
> +extern bool __is_raw_hwp_subpage(struct folio *folio, struct page *subpage);
> void num_poisoned_pages_inc(unsigned long pfn);
> void num_poisoned_pages_sub(unsigned long pfn, long i);
> struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
> @@ -3685,6 +3686,12 @@ static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> return 0;
> }
>
> +static inline bool __is_raw_hwp_subpage(struct folio *folio,
> + struct page *subpage)
> +{
> + return false;
> +}
> +
> static inline void num_poisoned_pages_inc(unsigned long pfn)
> {
> }
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index ea24718db4af..6b860de87590 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -7377,6 +7377,16 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> return ret;
> }
>
> +bool is_raw_hwp_subpage(struct folio *folio, struct page *subpage)
> +{
> + bool ret;
> +
> + spin_lock_irq(&hugetlb_lock);
> + ret = __is_raw_hwp_subpage(folio, subpage);
> + spin_unlock_irq(&hugetlb_lock);
Can you describe what races the hugetlb_lock prevents here?
--
Mike Kravetz
> + return ret;
> +}
> +
> void folio_putback_active_hugetlb(struct folio *folio)
> {
> spin_lock_irq(&hugetlb_lock);
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index c415c3c462a3..891248e2930e 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -1809,18 +1809,32 @@ EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
> #endif /* CONFIG_FS_DAX */
>
> #ifdef CONFIG_HUGETLB_PAGE
> -/*
> - * Struct raw_hwp_page represents information about "raw error page",
> - * constructing singly linked list from ->_hugetlb_hwpoison field of folio.
> - */
> -struct raw_hwp_page {
> - struct llist_node node;
> - struct page *page;
> -};
>
> -static inline struct llist_head *raw_hwp_list_head(struct folio *folio)
> +bool __is_raw_hwp_subpage(struct folio *folio, struct page *subpage)
> {
> - return (struct llist_head *)&folio->_hugetlb_hwpoison;
> + struct llist_head *raw_hwp_head;
> + struct raw_hwp_page *p, *tmp;
> + bool ret = false;
> +
> + if (!folio_test_hwpoison(folio))
> + return false;
> +
> + /*
> + * When RawHwpUnreliable is set, kernel lost track of which subpages
> + * are HWPOISON. So return as if ALL subpages are HWPOISONed.
> + */
> + if (folio_test_hugetlb_raw_hwp_unreliable(folio))
> + return true;
> +
> + raw_hwp_head = raw_hwp_list_head(folio);
> + llist_for_each_entry_safe(p, tmp, raw_hwp_head->first, node) {
> + if (subpage == p->page) {
> + ret = true;
> + break;
> + }
> + }
> +
> + return ret;
> }
>
> static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
> --
> 2.41.0.162.gfafddb0af9-goog
>
On Wed, Jul 5, 2023 at 4:57 PM Mike Kravetz <[email protected]> wrote:
>
> On 06/23/23 16:40, Jiaqi Yan wrote:
> > Adds the functionality to tell if a subpage of a hugetlb folio is a
> > raw HWPOISON page. This functionality relies on RawHwpUnreliable to
> > be not set; otherwise hugepage's HWPOISON list becomes meaningless.
> >
> > Exports this functionality to be immediately used in the read operation
> > for hugetlbfs.
> >
> > Signed-off-by: Jiaqi Yan <[email protected]>
> > ---
> > include/linux/hugetlb.h | 19 +++++++++++++++++++
> > include/linux/mm.h | 7 +++++++
> > mm/hugetlb.c | 10 ++++++++++
> > mm/memory-failure.c | 34 ++++++++++++++++++++++++----------
> > 4 files changed, 60 insertions(+), 10 deletions(-)
> >
> > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > index 21f942025fec..8b73a12b7b38 100644
> > --- a/include/linux/hugetlb.h
> > +++ b/include/linux/hugetlb.h
> > @@ -1013,6 +1013,25 @@ void hugetlb_register_node(struct node *node);
> > void hugetlb_unregister_node(struct node *node);
> > #endif
> >
> > +/*
> > + * Struct raw_hwp_page represents information about "raw error page",
> > + * constructing singly linked list from ->_hugetlb_hwpoison field of folio.
> > + */
> > +struct raw_hwp_page {
> > + struct llist_node node;
> > + struct page *page;
> > +};
> > +
> > +static inline struct llist_head *raw_hwp_list_head(struct folio *folio)
> > +{
> > + return (struct llist_head *)&folio->_hugetlb_hwpoison;
> > +}
> > +
> > +/*
> > + * Check if a given raw @subpage in a hugepage @folio is HWPOISON.
> > + */
> > +bool is_raw_hwp_subpage(struct folio *folio, struct page *subpage);
> > +
> > #else /* CONFIG_HUGETLB_PAGE */
> > struct hstate {};
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 66032f0d515c..41a283bd41a7 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -3671,6 +3671,7 @@ extern const struct attribute_group memory_failure_attr_group;
> > extern void memory_failure_queue(unsigned long pfn, int flags);
> > extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> > bool *migratable_cleared);
> > +extern bool __is_raw_hwp_subpage(struct folio *folio, struct page *subpage);
> > void num_poisoned_pages_inc(unsigned long pfn);
> > void num_poisoned_pages_sub(unsigned long pfn, long i);
> > struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
> > @@ -3685,6 +3686,12 @@ static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> > return 0;
> > }
> >
> > +static inline bool __is_raw_hwp_subpage(struct folio *folio,
> > + struct page *subpage)
> > +{
> > + return false;
> > +}
> > +
> > static inline void num_poisoned_pages_inc(unsigned long pfn)
> > {
> > }
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index ea24718db4af..6b860de87590 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -7377,6 +7377,16 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
> > return ret;
> > }
> >
> > +bool is_raw_hwp_subpage(struct folio *folio, struct page *subpage)
> > +{
> > + bool ret;
> > +
> > + spin_lock_irq(&hugetlb_lock);
> > + ret = __is_raw_hwp_subpage(folio, subpage);
> > + spin_unlock_irq(&hugetlb_lock);
>
> Can you describe what races the hugetlb_lock prevents here?
I think we should sync here with __get_huge_page_for_hwpoison, which
iterates over and inserts entries into raw_hwp_list. llist itself doesn't
ensure that insertion is synchronized with iteration from
__is_raw_hwp_subpage.
> --
> Mike Kravetz
>
> > + return ret;
> > +}
> > +
> > void folio_putback_active_hugetlb(struct folio *folio)
> > {
> > spin_lock_irq(&hugetlb_lock);
> > diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> > index c415c3c462a3..891248e2930e 100644
> > --- a/mm/memory-failure.c
> > +++ b/mm/memory-failure.c
> > @@ -1809,18 +1809,32 @@ EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
> > #endif /* CONFIG_FS_DAX */
> >
> > #ifdef CONFIG_HUGETLB_PAGE
> > -/*
> > - * Struct raw_hwp_page represents information about "raw error page",
> > - * constructing singly linked list from ->_hugetlb_hwpoison field of folio.
> > - */
> > -struct raw_hwp_page {
> > - struct llist_node node;
> > - struct page *page;
> > -};
> >
> > -static inline struct llist_head *raw_hwp_list_head(struct folio *folio)
> > +bool __is_raw_hwp_subpage(struct folio *folio, struct page *subpage)
> > {
> > - return (struct llist_head *)&folio->_hugetlb_hwpoison;
> > + struct llist_head *raw_hwp_head;
> > + struct raw_hwp_page *p, *tmp;
> > + bool ret = false;
> > +
> > + if (!folio_test_hwpoison(folio))
> > + return false;
> > +
> > + /*
> > + * When RawHwpUnreliable is set, kernel lost track of which subpages
> > + * are HWPOISON. So return as if ALL subpages are HWPOISONed.
> > + */
> > + if (folio_test_hugetlb_raw_hwp_unreliable(folio))
> > + return true;
> > +
> > + raw_hwp_head = raw_hwp_list_head(folio);
> > + llist_for_each_entry_safe(p, tmp, raw_hwp_head->first, node) {
> > + if (subpage == p->page) {
> > + ret = true;
> > + break;
> > + }
> > + }
> > +
> > + return ret;
> > }
> >
> > static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
> > --
> > 2.41.0.162.gfafddb0af9-goog
> >
On 07/06/23 11:25, Jiaqi Yan wrote:
> On Wed, Jul 5, 2023 at 4:57 PM Mike Kravetz <[email protected]> wrote:
> > On 06/23/23 16:40, Jiaqi Yan wrote:
> > >
> > > +bool is_raw_hwp_subpage(struct folio *folio, struct page *subpage)
> > > +{
> > > + bool ret;
> > > +
> > > + spin_lock_irq(&hugetlb_lock);
> > > + ret = __is_raw_hwp_subpage(folio, subpage);
> > > + spin_unlock_irq(&hugetlb_lock);
> >
> > Can you describe what races the hugetlb_lock prevents here?
>
> I think we should sync here with __get_huge_page_for_hwpoison, who
> iterates and inserts an entry to raw_hwp_list. llist itself doesn't
> ensure insertion is synchronized with iterating from
> __is_raw_hwp_subpage.
>
Ok, makes sense. And, since this is only called in the file read patch
when we encounter a PageHWPoison(page), the overhead of the lock cycles
is not of concern.
You can add,
Reviewed-by: Mike Kravetz <[email protected]>
--
Mike Kravetz
On 06/23/23 16:40, Jiaqi Yan wrote:
> When a hugepage contains HWPOISON pages, read() fails to read any byte
> of the hugepage and returns -EIO, although many bytes in the HWPOISON
> hugepage are readable.
>
> Improve this by allowing hugetlbfs_read_iter returns as many bytes as
> possible. For a requested range [offset, offset + len) that contains
> HWPOISON page, return [offset, first HWPOISON page addr); the next read
> attempt will fail and return -EIO.
>
> Signed-off-by: Jiaqi Yan <[email protected]>
> ---
> fs/hugetlbfs/inode.c | 58 +++++++++++++++++++++++++++++++++++++++-----
> 1 file changed, 52 insertions(+), 6 deletions(-)
I went through the code and could not find any problems.
Reviewed-by: Mike Kravetz <[email protected]>
However, code like this is where I often make mistakes. So, it would be
great if someone else could take a look as well.
--
Mike Kravetz
>
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index 90361a922cec..86879ca3ff1e 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -282,6 +282,42 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
> }
> #endif
>
> +/*
> + * Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset.
> + * Returns the maximum number of bytes one can read without touching the 1st raw
> + * HWPOISON subpage.
> + *
> + * The implementation borrows the iteration logic from copy_page_to_iter*.
> + */
> +static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t bytes)
> +{
> + size_t n = 0;
> + size_t res = 0;
> + struct folio *folio = page_folio(page);
> +
> + /* First subpage to start the loop. */
> + page += offset / PAGE_SIZE;
> + offset %= PAGE_SIZE;
> + while (1) {
> + if (is_raw_hwp_subpage(folio, page))
> + break;
> +
> + /* Safe to read n bytes without touching HWPOISON subpage. */
> + n = min(bytes, (size_t)PAGE_SIZE - offset);
> + res += n;
> + bytes -= n;
> + if (!bytes || !n)
> + break;
> + offset += n;
> + if (offset == PAGE_SIZE) {
> + page++;
> + offset = 0;
> + }
> + }
> +
> + return res;
> +}
> +
> /*
> * Support for read() - Find the page attached to f_mapping and copy out the
> * data. This provides functionality similar to filemap_read().
> @@ -300,7 +336,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
>
> while (iov_iter_count(to)) {
> struct page *page;
> - size_t nr, copied;
> + size_t nr, copied, want;
>
> /* nr is the maximum number of bytes to copy from this page */
> nr = huge_page_size(h);
> @@ -328,16 +364,26 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
> } else {
> unlock_page(page);
>
> - if (PageHWPoison(page)) {
> - put_page(page);
> - retval = -EIO;
> - break;
> + if (!PageHWPoison(page))
> + want = nr;
> + else {
> + /*
> + * Adjust how many bytes safe to read without
> + * touching the 1st raw HWPOISON subpage after
> + * offset.
> + */
> + want = adjust_range_hwpoison(page, offset, nr);
> + if (want == 0) {
> + put_page(page);
> + retval = -EIO;
> + break;
> + }
> }
>
> /*
> * We have the page, copy it to user space buffer.
> */
> - copied = copy_page_to_iter(page, offset, nr, to);
> + copied = copy_page_to_iter(page, offset, want, to);
> put_page(page);
> }
> offset += copied;
> --
> 2.41.0.162.gfafddb0af9-goog
>
On 06/23/23 16:40, Jiaqi Yan wrote:
> Add tests for the improvement made to read operation on HWPOISON
> hugetlb page with different read granularities. For each chunk size,
> three read scenarios are tested:
> 1. Simple regression test on read without HWPOISON.
> 2. Sequential read page by page should succeed until it encounters the
> 1st raw HWPOISON subpage.
> 3. After skipping the raw HWPOISON subpage via lseek, read()s always succeed.
>
> Signed-off-by: Jiaqi Yan <[email protected]>
> ---
> tools/testing/selftests/mm/.gitignore | 1 +
> tools/testing/selftests/mm/Makefile | 1 +
> .../selftests/mm/hugetlb-read-hwpoison.c | 322 ++++++++++++++++++
> 3 files changed, 324 insertions(+)
> create mode 100644 tools/testing/selftests/mm/hugetlb-read-hwpoison.c
Thank you for adding a selftest!
Acked-by: Mike Kravetz <[email protected]>
--
Mike Kravetz
On Fri, Jun 23, 2023 at 04:40:14PM +0000, Jiaqi Yan wrote:
> When a hugepage contains HWPOISON pages, read() fails to read any byte
> of the hugepage and returns -EIO, although many bytes in the HWPOISON
> hugepage are readable.
>
> Improve this by allowing hugetlbfs_read_iter to return as many bytes as
> possible. For a requested range [offset, offset + len) that contains
> HWPOISON page, return [offset, first HWPOISON page addr); the next read
> attempt will fail and return -EIO.
>
> Signed-off-by: Jiaqi Yan <[email protected]>
Looks good to me.
Reviewed-by: Naoya Horiguchi <[email protected]>
On Fri, Jun 23, 2023 at 04:40:13PM +0000, Jiaqi Yan wrote:
> Adds the functionality to tell if a subpage of a hugetlb folio is a
> raw HWPOISON page. This functionality relies on RawHwpUnreliable not
> being set; otherwise the hugepage's HWPOISON list becomes meaningless.
>
> Exports this functionality to be immediately used in the read operation
> for hugetlbfs.
>
> Signed-off-by: Jiaqi Yan <[email protected]>
Looks good to me, thank you.
Acked-by: Naoya Horiguchi <[email protected]>
On Fri, Jun 23, 2023 at 04:40:15PM +0000, Jiaqi Yan wrote:
> Add tests for the improvement made to read operation on HWPOISON
> hugetlb page with different read granularities. For each chunk size,
> three read scenarios are tested:
> 1. Simple regression test on read without HWPOISON.
> 2. Sequential read page by page should succeed until it encounters the
> 1st raw HWPOISON subpage.
> 3. After skipping the raw HWPOISON subpage via lseek, read()s always succeed.
>
> Signed-off-by: Jiaqi Yan <[email protected]>
Reviewed-by: Naoya Horiguchi <[email protected]>
On Thu, Jul 6, 2023 at 3:06 PM Mike Kravetz <[email protected]> wrote:
>
> On 07/06/23 11:25, Jiaqi Yan wrote:
> > On Wed, Jul 5, 2023 at 4:57 PM Mike Kravetz <[email protected]> wrote:
> > > On 06/23/23 16:40, Jiaqi Yan wrote:
> > > >
> > > > +bool is_raw_hwp_subpage(struct folio *folio, struct page *subpage)
> > > > +{
> > > > + bool ret;
> > > > +
> > > > + spin_lock_irq(&hugetlb_lock);
> > > > + ret = __is_raw_hwp_subpage(folio, subpage);
> > > > + spin_unlock_irq(&hugetlb_lock);
> > >
> > > Can you describe what races the hugetlb_lock prevents here?
> >
> > I think we should sync here with __get_huge_page_for_hwpoison, who
> > iterates and inserts an entry to raw_hwp_list. llist itself doesn't
> > ensure insertion is synchronized with iterating from
> > __is_raw_hwp_subpage.
> >
>
> Ok, makes sense. And, since this is only called in the file read patch
> when we encounter a PageHWPoison(page), the overhead of the lock cycles
> is not of concern.
Yes, thanks for pointing this out (which I forgot): is_raw_hwp_subpage()
is only called after the PageHWPoison check in hugetlbfs_read_iter. I
think both this and the reason for holding the lock are worth mentioning
in the commit msg.
>
> You can add,
>
> Reviewed-by: Mike Kravetz <[email protected]>
> --
> Mike Kravetz