2022-01-31 11:40:28

by Jane Chu

Subject: [PATCH v5 0/7] DAX poison recovery

In this series, the dax recovery code path is made independent of
the normal write path. Competing dax recovery threads are
serialized, and racing read threads are guaranteed not to overlap
with the recovery process.

In this phase, the recovery granularity is a page; future patches
will explore recovery at a finer granularity.
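
For reference, a rough sketch of how a write engages the recovery
path (helpers as introduced by this series; error handling elided,
illustration only, not actual patch code):

        map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
                        &kaddr, NULL);
        if (map_len == -EIO && iov_iter_rw(iter) == WRITE) {
                /* flag this mapping request as a recovery attempt */
                if (dax_prep_recovery(dax_dev, &kaddr) < 0)
                        return -EIO;
                map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
                                &kaddr, NULL);
                if (map_len > 0)
                        /* serialized, page-granularity recovery write */
                        xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
                                        map_len, iter);
        }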

Changes from v4:
Fixed build errors reported by the kernel test robot
Changes from v3:
Rebased to v5.17-rc1-81-g0280e3c58f92

v4:
https://lore.kernel.org/lkml/[email protected]/T/
v3:
https://lkml.org/lkml/2022/1/11/900
v2:
https://lore.kernel.org/all/[email protected]/
Discussions about marking a poisoned page as 'np':
https://lore.kernel.org/all/CAPcyv4hrXPb1tASBZUg-GgdVs0OOFKXMXLiHmktg_kFi7YBMyQ@mail.gmail.com/


Jane Chu (7):
mce: fix set_mce_nospec to always unmap the whole page
dax: introduce dax device flag DAXDEV_RECOVERY
dm: make dm aware of target's DAXDEV_RECOVERY capability
dax: add dax_recovery_write to dax_op and dm target type
pmem: add pmem_recovery_write() dax op
dax: add recovery_write to dax_iomap_iter in failure path
pmem: fix pmem_do_write() avoid writing to 'np' page

arch/x86/include/asm/set_memory.h | 17 ++----
arch/x86/kernel/cpu/mce/core.c | 6 +-
arch/x86/mm/pat/set_memory.c | 8 ++-
drivers/dax/super.c | 41 +++++++++++++
drivers/md/dm-linear.c | 12 ++++
drivers/md/dm-log-writes.c | 12 ++++
drivers/md/dm-stripe.c | 13 ++++
drivers/md/dm-table.c | 33 +++++++++++
drivers/md/dm.c | 27 +++++++++
drivers/nvdimm/pmem.c | 99 ++++++++++++++++++++++++++++---
drivers/nvdimm/pmem.h | 1 +
fs/dax.c | 23 ++++++-
include/linux/dax.h | 30 ++++++++++
include/linux/device-mapper.h | 9 +++
include/linux/set_memory.h | 2 +-
15 files changed, 306 insertions(+), 27 deletions(-)

--
2.18.4


2022-01-31 11:41:46

by Jane Chu

Subject: [PATCH v5 4/7] dax: add dax_recovery_write to dax_op and dm target type

The dax_recovery_write() dax op is only required for a DAX device
that exports DAXDEV_RECOVERY, indicating its capability to recover
from poison.

DM may be nested: if any of the base dax devices forming a DM
device supports dax recovery, the DM device is marked with that
capability.
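
A rough sketch of that propagation (the real change is in patch 3/7;
"dev" and "md_dax_dev" below are placeholders for a table member
device and the DM device's own dax_device):

        /* any member dax device that can recover marks the DM device */
        if (dax_recovery_capable(dev->dax_dev))
                set_dax_recovery(md_dax_dev);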

Signed-off-by: Jane Chu <[email protected]>
---
drivers/dax/super.c | 17 +++++++++++++++++
drivers/md/dm-linear.c | 12 ++++++++++++
drivers/md/dm-log-writes.c | 12 ++++++++++++
drivers/md/dm-stripe.c | 13 +++++++++++++
drivers/md/dm.c | 27 +++++++++++++++++++++++++++
drivers/nvdimm/pmem.c | 7 +++++++
include/linux/dax.h | 16 ++++++++++++++++
include/linux/device-mapper.h | 9 +++++++++
8 files changed, 113 insertions(+)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index f4f607d9698b..a136fa6b3e36 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -195,6 +195,23 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);

+size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
+ void *addr, size_t bytes, struct iov_iter *i)
+{
+ if (!dax_recovery_capable(dax_dev) || !dax_dev->ops->recovery_write)
+ return (size_t)-EOPNOTSUPP;
+ return dax_dev->ops->recovery_write(dax_dev, pgoff, addr, bytes, i);
+}
+EXPORT_SYMBOL_GPL(dax_recovery_write);
+
+bool dax_recovery_started(struct dax_device *dax_dev, void **kaddr)
+{
+ if (!kaddr || !dax_recovery_capable(dax_dev))
+ return false;
+ return test_bit(DAXDEV_RECOVERY, (unsigned long *)kaddr);
+}
+EXPORT_SYMBOL_GPL(dax_recovery_started);
+
#ifdef CONFIG_ARCH_HAS_PMEM_API
void arch_wb_cache_pmem(void *addr, size_t size);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 1b97a11d7151..831c565bf024 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -188,9 +188,20 @@ static int linear_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
return dax_zero_page_range(dax_dev, pgoff, nr_pages);
}

+static size_t linear_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff,
+ void *addr, size_t bytes, struct iov_iter *i)
+{
+ struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff);
+
+ if (!dax_recovery_capable(dax_dev))
+ return (size_t) -EOPNOTSUPP;
+ return dax_recovery_write(dax_dev, pgoff, addr, bytes, i);
+}
+
#else
#define linear_dax_direct_access NULL
#define linear_dax_zero_page_range NULL
+#define linear_dax_recovery_write NULL
#endif

static struct target_type linear_target = {
@@ -208,6 +219,7 @@ static struct target_type linear_target = {
.iterate_devices = linear_iterate_devices,
.direct_access = linear_dax_direct_access,
.dax_zero_page_range = linear_dax_zero_page_range,
+ .dax_recovery_write = linear_dax_recovery_write,
};

int __init dm_linear_init(void)
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 139b09b06eda..22c2d64ef963 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -927,9 +927,20 @@ static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
return dax_zero_page_range(dax_dev, pgoff, nr_pages << PAGE_SHIFT);
}

+static size_t log_writes_dax_recovery_write(struct dm_target *ti,
+ pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i)
+{
+ struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff);
+
+ if (!dax_recovery_capable(dax_dev))
+ return (size_t) -EOPNOTSUPP;
+ return dax_recovery_write(dax_dev, pgoff, addr, bytes, i);
+}
+
#else
#define log_writes_dax_direct_access NULL
#define log_writes_dax_zero_page_range NULL
+#define log_writes_dax_recovery_write NULL
#endif

static struct target_type log_writes_target = {
@@ -947,6 +958,7 @@ static struct target_type log_writes_target = {
.io_hints = log_writes_io_hints,
.direct_access = log_writes_dax_direct_access,
.dax_zero_page_range = log_writes_dax_zero_page_range,
+ .dax_recovery_write = log_writes_dax_recovery_write,
};

static int __init dm_log_writes_init(void)
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index e566115ec0bb..78c52c8865ef 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -332,9 +332,21 @@ static int stripe_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
return dax_zero_page_range(dax_dev, pgoff, nr_pages);
}

+static size_t stripe_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff,
+ void *addr, size_t bytes, struct iov_iter *i)
+{
+ struct dax_device *dax_dev = stripe_dax_pgoff(ti, &pgoff);
+
+ if (!dax_recovery_capable(dax_dev))
+ return (size_t) -EOPNOTSUPP;
+
+ return dax_recovery_write(dax_dev, pgoff, addr, bytes, i);
+}
+
#else
#define stripe_dax_direct_access NULL
#define stripe_dax_zero_page_range NULL
+#define stripe_dax_recovery_write NULL
#endif

/*
@@ -471,6 +483,7 @@ static struct target_type stripe_target = {
.io_hints = stripe_io_hints,
.direct_access = stripe_dax_direct_access,
.dax_zero_page_range = stripe_dax_zero_page_range,
+ .dax_recovery_write = stripe_dax_recovery_write,
};

int __init dm_stripe_init(void)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index c0ae8087c602..bdc142258ace 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1054,6 +1054,32 @@ static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
return ret;
}

+static size_t dm_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
+ void *addr, size_t bytes, struct iov_iter *i)
+{
+ struct mapped_device *md = dax_get_private(dax_dev);
+ sector_t sector = pgoff * PAGE_SECTORS;
+ struct dm_target *ti;
+ long ret = 0;
+ int srcu_idx;
+
+ ti = dm_dax_get_live_target(md, sector, &srcu_idx);
+
+ if (!ti)
+ goto out;
+
+ if (!ti->type->dax_recovery_write) {
+ ret = (size_t)-EOPNOTSUPP;
+ goto out;
+ }
+
+ ret = ti->type->dax_recovery_write(ti, pgoff, addr, bytes, i);
+out:
+ dm_put_live_table(md, srcu_idx);
+
+ return ret;
+}
+
/*
* A target may call dm_accept_partial_bio only from the map routine. It is
* allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management
@@ -2980,6 +3006,7 @@ static const struct block_device_operations dm_rq_blk_dops = {
static const struct dax_operations dm_dax_ops = {
.direct_access = dm_dax_direct_access,
.zero_page_range = dm_dax_zero_page_range,
+ .recovery_write = dm_dax_recovery_write,
};

/*
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index e8823813a8df..638e64681db9 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -301,9 +301,16 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev,
return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn);
}

+static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
+ void *addr, size_t bytes, struct iov_iter *i)
+{
+ return 0;
+}
+
static const struct dax_operations pmem_dax_ops = {
.direct_access = pmem_dax_direct_access,
.zero_page_range = pmem_dax_zero_page_range,
+ .recovery_write = pmem_recovery_write,
};

static ssize_t write_cache_show(struct device *dev,
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 2fc776653c6e..1b3d6ebf3e49 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -30,6 +30,9 @@ struct dax_operations {
sector_t, sector_t);
/* zero_page_range: required operation. Zero page range */
int (*zero_page_range)(struct dax_device *, pgoff_t, size_t);
+ /* recovery_write: optional operation. */
+ size_t (*recovery_write)(struct dax_device *, pgoff_t, void *, size_t,
+ struct iov_iter *);
};

#if IS_ENABLED(CONFIG_DAX)
@@ -43,6 +46,9 @@ void set_dax_synchronous(struct dax_device *dax_dev);
void set_dax_recovery(struct dax_device *dax_dev);
bool dax_recovery_capable(struct dax_device *dax_dev);
int dax_prep_recovery(struct dax_device *dax_dev, void **kaddr);
+bool dax_recovery_started(struct dax_device *dax_dev, void **kaddr);
+size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
+ size_t bytes, struct iov_iter *i);
/*
* Check if given mapping is supported by the file / underlying device.
*/
@@ -101,6 +107,16 @@ static inline int dax_prep_recovery(struct dax_device *dax_dev, void **kaddr)
{
return -EINVAL;
}
+static inline bool dax_recovery_started(struct dax_device *dax_dev,
+ void **kaddr)
+{
+ return false;
+}
+static inline size_t dax_recovery_write(struct dax_device *dax_dev,
+ pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i)
+{
+ return 0;
+}
#endif

void set_dax_nocache(struct dax_device *dax_dev);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index b26fecf6c8e8..4f134ba63d3c 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -150,6 +150,14 @@ typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff,
typedef int (*dm_dax_zero_page_range_fn)(struct dm_target *ti, pgoff_t pgoff,
size_t nr_pages);

+/*
+ * Returns:
+ * != 0 : number of bytes transferred, or size_t casted negative error code
+ * 0 : failure
+ */
+typedef size_t (*dm_dax_recovery_write_fn)(struct dm_target *ti, pgoff_t pgoff,
+ void *addr, size_t bytes, struct iov_iter *i);
+
void dm_error(const char *message);

struct dm_dev {
@@ -199,6 +207,7 @@ struct target_type {
dm_io_hints_fn io_hints;
dm_dax_direct_access_fn direct_access;
dm_dax_zero_page_range_fn dax_zero_page_range;
+ dm_dax_recovery_write_fn dax_recovery_write;

/* For internal device-mapper use. */
struct list_head list;
--
2.18.4

2022-01-31 11:41:47

by Jane Chu

Subject: [PATCH v5 6/7] dax: add recovery_write to dax_iomap_iter in failure path

dax_iomap_iter() fails if the destination range contains poison.
Add recovery_write to the failure code path.

Signed-off-by: Jane Chu <[email protected]>
---
fs/dax.c | 23 +++++++++++++++++++++--
1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index cd03485867a7..236675bd5946 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1199,6 +1199,8 @@ int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
}
EXPORT_SYMBOL_GPL(dax_truncate_page);

+typedef size_t (*iter_func_t)(struct dax_device *dax_dev, pgoff_t pgoff,
+ void *addr, size_t bytes, struct iov_iter *i);
static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
struct iov_iter *iter)
{
@@ -1210,6 +1212,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
ssize_t ret = 0;
size_t xfer;
int id;
+ iter_func_t write_func = dax_copy_from_iter;

if (iov_iter_rw(iter) == READ) {
end = min(end, i_size_read(iomi->inode));
@@ -1249,6 +1252,17 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,

map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
&kaddr, NULL);
+ if ((map_len == -EIO) && (iov_iter_rw(iter) == WRITE)) {
+ if (dax_prep_recovery(dax_dev, &kaddr) < 0) {
+ ret = map_len;
+ break;
+ }
+ map_len = dax_direct_access(dax_dev, pgoff,
+ PHYS_PFN(size), &kaddr, NULL);
+ if (map_len > 0)
+ write_func = dax_recovery_write;
+ }
+
if (map_len < 0) {
ret = map_len;
break;
@@ -1261,12 +1275,17 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
map_len = end - pos;

if (iov_iter_rw(iter) == WRITE)
- xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
- map_len, iter);
+ xfer = write_func(dax_dev, pgoff, kaddr, map_len, iter);
else
xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
map_len, iter);

+ if (xfer == (ssize_t) -EIO) {
+ pr_warn("dax_iomap_iter: write_func returns -EIO\n");
+ ret = -EIO;
+ break;
+ }
+
pos += xfer;
length -= xfer;
done += xfer;
--
2.18.4

2022-02-02 21:26:38

by Christoph Hellwig

Subject: Re: [PATCH v5 4/7] dax: add dax_recovery_write to dax_op and dm target type

On Fri, Jan 28, 2022 at 02:31:47PM -0700, Jane Chu wrote:
> The dax_recovery_write() dax op is only required for a DAX device
> that exports DAXDEV_RECOVERY, indicating its capability to recover
> from poison.
>
> DM may be nested: if any of the base dax devices forming a DM
> device supports dax recovery, the DM device is marked with that
> capability.

I'd fold this into the previous 2 patches as the flag and method
are clearly very tightly coupled.

> +static size_t linear_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff,
> + void *addr, size_t bytes, struct iov_iter *i)

Function line continuations use two tab indentations, or alignment
after the opening parenthesis.
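
For illustration, the prototype quoted above could be wrapped either
way, e.g.:

static size_t linear_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff,
                void *addr, size_t bytes, struct iov_iter *i)

or:

static size_t linear_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff,
                                        void *addr, size_t bytes,
                                        struct iov_iter *i)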

> +{
> + struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff);
> +
> + if (!dax_recovery_capable(dax_dev))
> + return (size_t) -EOPNOTSUPP;

Returning a negative errno through an unsigned return type looks dangerous.
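
For illustration, a typical caller pattern breaks silently (sketch,
not actual code):

        size_t xfer = ti->type->dax_recovery_write(ti, pgoff, addr, bytes, i);

        /*
         * size_t is unsigned, so (size_t)-EOPNOTSUPP is a huge positive
         * value: this check can never fire, and the error is consumed
         * as a bogus byte count.
         */
        if (xfer < 0)
                return xfer;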

> + /* recovery_write: optional operation. */

An explanation of what the method is used for might be more useful than
mentioning that it is optional.

> + size_t (*recovery_write)(struct dax_device *, pgoff_t, void *, size_t,
> + struct iov_iter *);

Spelling out the arguments tends to help readability, but then again
none of the existing methods does it.

2022-02-03 15:47:32

by Christoph Hellwig

Subject: Re: [PATCH v5 6/7] dax: add recovery_write to dax_iomap_iter in failure path

On Fri, Jan 28, 2022 at 02:31:49PM -0700, Jane Chu wrote:
> +typedef size_t (*iter_func_t)(struct dax_device *dax_dev, pgoff_t pgoff,
> + void *addr, size_t bytes, struct iov_iter *i);
> static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
> struct iov_iter *iter)
> {
> @@ -1210,6 +1212,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
> ssize_t ret = 0;
> size_t xfer;
> int id;
> + iter_func_t write_func = dax_copy_from_iter;

This use of a function pointer causes indirect call overhead. A simple
"bool in_recovery" or do_recovery flag does the trick in a way that is
more readable and generates faster code.

> + if ((map_len == -EIO) && (iov_iter_rw(iter) == WRITE)) {

No need for the braces.

> if (iov_iter_rw(iter) == WRITE)
> - xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
> - map_len, iter);
> + xfer = write_func(dax_dev, pgoff, kaddr, map_len, iter);
> else
> xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
> map_len, iter);

i.e.

if (iov_iter_rw(iter) == READ)
xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
map_len, iter);
else if (unlikely(do_recovery))
xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
map_len, iter);
else
xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
map_len, iter);

2022-02-04 14:50:50

by Jane Chu

Subject: Re: [PATCH v5 6/7] dax: add recovery_write to dax_iomap_iter in failure path

On 2/2/2022 5:44 AM, Christoph Hellwig wrote:
> On Fri, Jan 28, 2022 at 02:31:49PM -0700, Jane Chu wrote:
>> +typedef size_t (*iter_func_t)(struct dax_device *dax_dev, pgoff_t pgoff,
>> + void *addr, size_t bytes, struct iov_iter *i);
>> static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
>> struct iov_iter *iter)
>> {
>> @@ -1210,6 +1212,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
>> ssize_t ret = 0;
>> size_t xfer;
>> int id;
>> + iter_func_t write_func = dax_copy_from_iter;
>
> This use of a function pointer causes indirect call overhead. A simple
> "bool in_recovery" or do_recovery flag does the trick in a way that is
> more readable and generates faster code.

Good point, thanks!

>
>> + if ((map_len == -EIO) && (iov_iter_rw(iter) == WRITE)) {
>
> No need for the braces.

Did you mean the outer "( )"?

>
>> if (iov_iter_rw(iter) == WRITE)
>> - xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
>> - map_len, iter);
>> + xfer = write_func(dax_dev, pgoff, kaddr, map_len, iter);
>> else
>> xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
>> map_len, iter);
>
> i.e.
>
> if (iov_iter_rw(iter) == READ)
> xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
> map_len, iter);
> else if (unlikely(do_recovery))
> xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
> map_len, iter);
> else
> xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
> map_len, iter);
>

Will do.

Thanks a lot!

-jane

2022-02-04 18:08:49

by Jane Chu

Subject: Re: [PATCH v5 4/7] dax: add dax_recovery_write to dax_op and dm target type

On 2/2/2022 5:34 AM, Christoph Hellwig wrote:
> On Fri, Jan 28, 2022 at 02:31:47PM -0700, Jane Chu wrote:
>> The dax_recovery_write() dax op is only required for a DAX device
>> that exports DAXDEV_RECOVERY, indicating its capability to recover
>> from poison.
>>
>> DM may be nested: if any of the base dax devices forming a DM
>> device supports dax recovery, the DM device is marked with that
>> capability.
>
> I'd fold this into the previous 2 patches as the flag and method
> are clearly very tightly coupled.

Makes sense, will do.

>
>> +static size_t linear_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff,
>> + void *addr, size_t bytes, struct iov_iter *i)
>
> Function line continuations use two tab indentations or alignment after
> the opening brace.

Okay.

>
>> +{
>> + struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff);
>> +
>> + if (!dax_recovery_capable(dax_dev))
>> + return (size_t) -EOPNOTSUPP;
>
> Returning a negative errno through an unsigned return type looks dangerous.

Sorry, should be (ssize_t) there.

>
>> + /* recovery_write: optional operation. */
>
> An explanation of what the method is used for might be more useful than
> mentioning that it is optional.

Will add substance to comments.

>
>> + size_t (*recovery_write)(struct dax_device *, pgoff_t, void *, size_t,
>> + struct iov_iter *);
>
> Spelling out the arguments tends to help readability, but then again
> none of the existing methods does it.

Thanks!
-jane

2022-02-07 05:56:43

by Jane Chu

Subject: Re: [PATCH v5 4/7] dax: add dax_recovery_write to dax_op and dm target type

On 2/3/2022 10:03 PM, Dan Williams wrote:
> On Fri, Jan 28, 2022 at 1:32 PM Jane Chu <[email protected]> wrote:
>>
>> The dax_recovery_write() dax op is only required for a DAX device
>> that exports DAXDEV_RECOVERY, indicating its capability to recover
>> from poison.
>>
>> DM may be nested: if any of the base dax devices forming a DM
>> device supports dax recovery, the DM device is marked with that
>> capability.
>>
>> Signed-off-by: Jane Chu <[email protected]>
> [..]
>> diff --git a/include/linux/dax.h b/include/linux/dax.h
>> index 2fc776653c6e..1b3d6ebf3e49 100644
>> --- a/include/linux/dax.h
>> +++ b/include/linux/dax.h
>> @@ -30,6 +30,9 @@ struct dax_operations {
>> sector_t, sector_t);
>> /* zero_page_range: required operation. Zero page range */
>> int (*zero_page_range)(struct dax_device *, pgoff_t, size_t);
>> + /* recovery_write: optional operation. */
>> + size_t (*recovery_write)(struct dax_device *, pgoff_t, void *, size_t,
>> + struct iov_iter *);
>
> The removal of the ->copy_{to,from}_iter() operations set the
> precedent that dax ops should not be needed when the operation can be
> carried out generically. The only need to call back to the pmem driver
> is so that it can call nvdimm_clear_poison(). nvdimm_clear_poison() in
> turn only needs the 'struct device' hosting the pmem and the physical
> address to be cleared. The physical address is already returned by
> dax_direct_access(). The device is something that could be added to
> dax_device, and the pgmap could host the callback that pmem fills in.
> Something like:
>
>
> diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
> index 58eda16f5c53..36486ba4753a 100644
> --- a/drivers/nvdimm/pfn_devs.c
> +++ b/drivers/nvdimm/pfn_devs.c
> @@ -694,6 +694,7 @@ static int __nvdimm_setup_pfn(struct nd_pfn
> *nd_pfn, struct dev_pagemap *pgmap)
> .end = nsio->res.end - end_trunc,
> };
> pgmap->nr_range = 1;
> + pgmap->owner = &nd_pfn->dev;
> if (nd_pfn->mode == PFN_MODE_RAM) {
> if (offset < reserve)
> return -EINVAL;
> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> index 58d95242a836..95e1b6326f88 100644
> --- a/drivers/nvdimm/pmem.c
> +++ b/drivers/nvdimm/pmem.c
> @@ -481,6 +481,7 @@ static int pmem_attach_disk(struct device *dev,
> }
> set_dax_nocache(dax_dev);
> set_dax_nomc(dax_dev);
> + set_dax_pgmap(dax_dev, &pmem->pgmap);
> if (is_nvdimm_sync(nd_region))
> set_dax_synchronous(dax_dev);
> rc = dax_add_host(dax_dev, disk);
> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
> index 1fafcc38acba..8cb59b5df38b 100644
> --- a/include/linux/memremap.h
> +++ b/include/linux/memremap.h
> @@ -81,6 +81,11 @@ struct dev_pagemap_ops {
>
> #define PGMAP_ALTMAP_VALID (1 << 0)
>
> +struct dev_pagemap_operations {
> + size_t (*recovery_write)(struct dev_pagemap *pgmap, void *, size_t,
> + struct iov_iter *);
> +};
> +
> /**
> * struct dev_pagemap - metadata for ZONE_DEVICE mappings
> * @altmap: pre-allocated/reserved memory for vmemmap allocations
> @@ -111,12 +116,15 @@ struct dev_pagemap {
> const struct dev_pagemap_ops *ops;
> void *owner;
> int nr_range;
> + struct dev_pagemap_operations ops;
> union {
> struct range range;
> struct range ranges[0];
> };
> };
>
> ...then DM does not need to be involved in the recovery path; fs/dax.c
> just does dax_direct_access(..., DAX_RECOVERY, ...) and then looks up
> the pgmap to generically coordinate the recovery_write(). The pmem
> driver would be responsible for setting pgmap->recovery_write() to a
> function that calls nvdimm_clear_poison().
>
> This approach works for anything that can be described by a pgmap and
> supports error clearing; it need not be limited to the pmem block
> driver.

This is an interesting idea, let me give it a try and get back to you.

Thanks!
-jane

2022-02-08 15:52:39

by Dan Williams

Subject: Re: [PATCH v5 4/7] dax: add dax_recovery_write to dax_op and dm target type

On Fri, Jan 28, 2022 at 1:32 PM Jane Chu <[email protected]> wrote:
>
> The dax_recovery_write() dax op is only required for a DAX device
> that exports DAXDEV_RECOVERY, indicating its capability to recover
> from poison.
>
> DM may be nested: if any of the base dax devices forming a DM
> device supports dax recovery, the DM device is marked with that
> capability.
>
> Signed-off-by: Jane Chu <[email protected]>
[..]
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index 2fc776653c6e..1b3d6ebf3e49 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -30,6 +30,9 @@ struct dax_operations {
> sector_t, sector_t);
> /* zero_page_range: required operation. Zero page range */
> int (*zero_page_range)(struct dax_device *, pgoff_t, size_t);
> + /* recovery_write: optional operation. */
> + size_t (*recovery_write)(struct dax_device *, pgoff_t, void *, size_t,
> + struct iov_iter *);

The removal of the ->copy_{to,from}_iter() operations set the
precedent that dax ops should not be needed when the operation can be
carried out generically. The only need to call back to the pmem driver
is so that it can call nvdimm_clear_poison(). nvdimm_clear_poison() in
turn only needs the 'struct device' hosting the pmem and the physical
address to be cleared. The physical address is already returned by
dax_direct_access(). The device is something that could be added to
dax_device, and the pgmap could host the callback that pmem fills in.
Something like:


diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 58eda16f5c53..36486ba4753a 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -694,6 +694,7 @@ static int __nvdimm_setup_pfn(struct nd_pfn
*nd_pfn, struct dev_pagemap *pgmap)
.end = nsio->res.end - end_trunc,
};
pgmap->nr_range = 1;
+ pgmap->owner = &nd_pfn->dev;
if (nd_pfn->mode == PFN_MODE_RAM) {
if (offset < reserve)
return -EINVAL;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 58d95242a836..95e1b6326f88 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -481,6 +481,7 @@ static int pmem_attach_disk(struct device *dev,
}
set_dax_nocache(dax_dev);
set_dax_nomc(dax_dev);
+ set_dax_pgmap(dax_dev, &pmem->pgmap);
if (is_nvdimm_sync(nd_region))
set_dax_synchronous(dax_dev);
rc = dax_add_host(dax_dev, disk);
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 1fafcc38acba..8cb59b5df38b 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -81,6 +81,11 @@ struct dev_pagemap_ops {

#define PGMAP_ALTMAP_VALID (1 << 0)

+struct dev_pagemap_operations {
+ size_t (*recovery_write)(struct dev_pagemap *pgmap, void *, size_t,
+ struct iov_iter *);
+};
+
/**
* struct dev_pagemap - metadata for ZONE_DEVICE mappings
* @altmap: pre-allocated/reserved memory for vmemmap allocations
@@ -111,12 +116,15 @@ struct dev_pagemap {
const struct dev_pagemap_ops *ops;
void *owner;
int nr_range;
+ struct dev_pagemap_operations ops;
union {
struct range range;
struct range ranges[0];
};
};

...then DM does not need to be involved in the recovery path; fs/dax.c
just does dax_direct_access(..., DAX_RECOVERY, ...) and then looks up
the pgmap to generically coordinate the recovery_write(). The pmem
driver would be responsible for setting pgmap->recovery_write() to a
function that calls nvdimm_clear_poison().

This approach works for anything that can be described by a pgmap and
supports error clearing; it need not be limited to the pmem block
driver.
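
To make that concrete, the fs/dax.c write path could then do something
like this (sketch only: DAX_RECOVERY, dax_dev_pgmap(), and the exact
placement of the recovery_write callback are all made up here, to match
the rough diff above):

        map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
                        DAX_RECOVERY, &kaddr, NULL);
        if (map_len > 0 && iov_iter_rw(iter) == WRITE) {
                struct dev_pagemap *pgmap = dax_dev_pgmap(dax_dev);

                if (pgmap && pgmap->recovery_write)
                        xfer = pgmap->recovery_write(pgmap, kaddr,
                                        map_len, iter);
        }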