In v3, the dax recovery code path is independent of the normal
write path. Competing dax recovery threads are serialized, and
racing read threads are guaranteed not to overlap with the
recovery process.
In this phase, the recovery granularity is a page; a future patch
will explore recovery at a finer granularity.
Please refer to below discussions for more information:
v2:
https://lore.kernel.org/all/[email protected]/
Discussions about marking poisoned page as 'np':
https://lore.kernel.org/all/CAPcyv4hrXPb1tASBZUg-GgdVs0OOFKXMXLiHmktg_kFi7YBMyQ@mail.gmail.com/
Jane Chu (7):
mce: fix set_mce_nospec to always unmap the whole page
dax: introduce dax device flag DAXDEV_RECOVERY
dm: make dm aware of target's DAXDEV_RECOVERY capability
dax: add dax_recovery_write to dax_op and dm target type
pmem: add pmem_recovery_write() dax op
dax: add recovery_write to dax_iomap_iter in failure path
pmem: fix pmem_do_write() avoid writing to 'np' page
arch/x86/include/asm/set_memory.h | 17 ++---
arch/x86/kernel/cpu/mce/core.c | 6 +-
arch/x86/mm/pat/set_memory.c | 8 ++-
drivers/dax/super.c | 53 ++++++++++++++++
drivers/md/dm-linear.c | 13 ++++
drivers/md/dm-log-writes.c | 14 +++++
drivers/md/dm-stripe.c | 13 ++++
drivers/md/dm-table.c | 33 ++++++++++
drivers/md/dm.c | 27 ++++++++
drivers/nvdimm/pmem.c | 101 +++++++++++++++++++++++++++---
drivers/nvdimm/pmem.h | 1 +
fs/dax.c | 25 +++++++-
include/linux/dax.h | 9 +++
include/linux/device-mapper.h | 3 +
include/linux/set_memory.h | 2 +-
15 files changed, 298 insertions(+), 27 deletions(-)
--
2.18.4
Since a poisoned page is marked as not-present, the first of the
two back-to-back write_pmem() calls can only be made when there
is no poison in the range; otherwise the kernel will Oops.
Signed-off-by: Jane Chu <[email protected]>
---
drivers/nvdimm/pmem.c | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index dd2db4905c85..6e395014da5e 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -187,10 +187,15 @@ static blk_status_t pmem_do_write(struct pmem_device *pmem,
* after clear poison.
*/
flush_dcache_page(page);
- write_pmem(pmem_addr, page, page_off, len);
- if (unlikely(bad_pmem)) {
- rc = pmem_clear_poison(pmem, pmem_off, len);
+ if (!bad_pmem) {
write_pmem(pmem_addr, page, page_off, len);
+ } else {
+ rc = pmem_clear_poison(pmem, pmem_off, len);
+ if (rc == BLK_STS_OK)
+ write_pmem(pmem_addr, page, page_off, len);
+ else
+ pr_warn("%s: failed to clear poison\n",
+ __func__);
}
return rc;
--
2.18.4
dax_iomap_iter() fails if the destination range contains poison.
Add recovery_write to the failure code path.
Signed-off-by: Jane Chu <[email protected]>
---
fs/dax.c | 25 +++++++++++++++++++++++--
1 file changed, 23 insertions(+), 2 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index e0eecd8e3a8f..c16362d3993c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1199,6 +1199,8 @@ int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
+typedef size_t (*iter_func_t)(struct dax_device *dax_dev, pgoff_t pgoff,
+ void *addr, size_t bytes, struct iov_iter *i);
static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
struct iov_iter *iter)
{
@@ -1210,6 +1212,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
ssize_t ret = 0;
size_t xfer;
int id;
+ iter_func_t write_func = dax_copy_from_iter;
if (iov_iter_rw(iter) == READ) {
end = min(end, i_size_read(iomi->inode));
@@ -1249,6 +1252,17 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
&kaddr, NULL);
+ if ((map_len == -EIO) && (iov_iter_rw(iter) == WRITE)) {
+ if (dax_prep_recovery(dax_dev, &kaddr) < 0) {
+ ret = map_len;
+ break;
+ }
+ map_len = dax_direct_access(dax_dev, pgoff,
+ PHYS_PFN(size), &kaddr, NULL);
+ if (map_len > 0)
+ write_func = dax_recovery_write;
+ }
+
if (map_len < 0) {
ret = map_len;
break;
@@ -1264,14 +1278,21 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
* The userspace address for the memory copy has already been
* validated via access_ok() in either vfs_read() or
* vfs_write(), depending on which operation we are doing.
+ * xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
+ * map_len, iter);
*/
if (iov_iter_rw(iter) == WRITE)
- xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
- map_len, iter);
+ xfer = write_func(dax_dev, pgoff, kaddr, map_len, iter);
else
xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
map_len, iter);
+ if (xfer == (ssize_t) -EIO) {
+ pr_warn("dax_ioma_iter: write_func returns-EIO\n");
+ ret = -EIO;
+ break;
+ }
+
pos += xfer;
length -= xfer;
done += xfer;
--
2.18.4
On Tue, Jan 11, 2022 at 11:59:23AM -0700, Jane Chu wrote:
> In v3, dax recovery code path is independent of that of
> normal write. Competing dax recovery threads are serialized,
> racing read threads are guaranteed not overlapping with the
> recovery process.
>
> In this phase, the recovery granularity is page, future patch
> will explore recovery in finer granularity.
What tree is this against? I can't apply it to either 5.16 or Linus'
current tree.
On 1/20/2022 1:55 AM, Christoph Hellwig wrote:
> On Tue, Jan 11, 2022 at 11:59:23AM -0700, Jane Chu wrote:
>> In v3, dax recovery code path is independent of that of
>> normal write. Competing dax recovery threads are serialized,
>> racing read threads are guaranteed not overlapping with the
>> recovery process.
>>
>> In this phase, the recovery granularity is page, future patch
>> will explore recovery in finer granularity.
>
> What tree is this against? I can't apply it to either 5.16 or Linus'
> current tree.
It was based on your 'dax-block-cleanup' branch a while back.
thanks,
-jane
On Fri, Jan 21, 2022 at 01:33:40AM +0000, Jane Chu wrote:
> > What tree is this against? I can't apply it to either 5.16 or Linus'
> > current tree.
>
> It was based on your 'dax-block-cleanup' branch a while back.
Do you have a git tree with your patches included available somewhere?
On 1/24/2022 1:01 AM, Christoph Hellwig wrote:
> On Fri, Jan 21, 2022 at 01:33:40AM +0000, Jane Chu wrote:
>>> What tree is this against? I can't apply it to either 5.16 or Linus'
>>> current tree.
>>
>> It was based on your 'dax-block-cleanup' branch a while back.
>
> Do you have a git tree with your patches included available somewhere?
Sorry I don't have a git tree, so I rebased the series to
v5.17-rc1-81-g0280e3c58f9, hope that helps.
-jane