2021-11-09 15:40:23

by Christoph Hellwig

[permalink] [raw]
Subject: decouple DAX from block devices

Hi Dan,

this series decouples DAX from the block layer so that the
block_device is not needed at all for the DAX I/O path.


2021-11-09 15:40:23

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 03/29] dax: remove CONFIG_DAX_DRIVER

CONFIG_DAX_DRIVER only selects CONFIG_DAX now, so remove it.

Signed-off-by: Christoph Hellwig <[email protected]>
---
drivers/dax/Kconfig | 4 ----
drivers/nvdimm/Kconfig | 2 +-
drivers/s390/block/Kconfig | 2 +-
fs/fuse/Kconfig | 2 +-
4 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index d2834c2cfa10d..954ab14ba7778 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -1,8 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
-config DAX_DRIVER
- select DAX
- bool
-
menuconfig DAX
tristate "DAX: direct access to differentiated memory"
select SRCU
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
index b7d1eb38b27d4..347fe7afa5830 100644
--- a/drivers/nvdimm/Kconfig
+++ b/drivers/nvdimm/Kconfig
@@ -22,7 +22,7 @@ if LIBNVDIMM
config BLK_DEV_PMEM
tristate "PMEM: Persistent memory block device support"
default LIBNVDIMM
- select DAX_DRIVER
+ select DAX
select ND_BTT if BTT
select ND_PFN if NVDIMM_PFN
help
diff --git a/drivers/s390/block/Kconfig b/drivers/s390/block/Kconfig
index d0416dbd0cd81..e3710a762abae 100644
--- a/drivers/s390/block/Kconfig
+++ b/drivers/s390/block/Kconfig
@@ -5,7 +5,7 @@ comment "S/390 block device drivers"
config DCSSBLK
def_tristate m
select FS_DAX_LIMITED
- select DAX_DRIVER
+ select DAX
prompt "DCSSBLK support"
depends on S390 && BLOCK
help
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 40ce9a1c12e5d..038ed0b9aaa5d 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -45,7 +45,7 @@ config FUSE_DAX
select INTERVAL_TREE
depends on VIRTIO_FS
depends on FS_DAX
- depends on DAX_DRIVER
+ depends on DAX
help
This allows bypassing guest page cache and allows mapping host page
cache directly in guest address space.
--
2.30.2

2021-11-09 15:40:23

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 01/29] nvdimm/pmem: move dax_attribute_group from dax to pmem

dax_attribute_group is only used by the pmem driver, and can avoid the
completely pointless lookup by the disk name if moved there. This
leaves just a single caller of dax_get_by_host, so move dax_get_by_host
into the same ifdef block as that caller.

Signed-off-by: Christoph Hellwig <[email protected]>
Reviewed-by: Dan Williams <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Dan Williams <[email protected]>
---
drivers/dax/super.c | 100 ++++++++----------------------------------
drivers/nvdimm/pmem.c | 43 ++++++++++++++++++
include/linux/dax.h | 2 -
3 files changed, 61 insertions(+), 84 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index fc89e91beea7c..b882cf8106ea3 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -63,6 +63,24 @@ static int dax_host_hash(const char *host)
return hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE;
}

+#ifdef CONFIG_BLOCK
+#include <linux/blkdev.h>
+
+int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
+ pgoff_t *pgoff)
+{
+ sector_t start_sect = bdev ? get_start_sect(bdev) : 0;
+ phys_addr_t phys_off = (start_sect + sector) * 512;
+
+ if (pgoff)
+ *pgoff = PHYS_PFN(phys_off);
+ if (phys_off % PAGE_SIZE || size % PAGE_SIZE)
+ return -EINVAL;
+ return 0;
+}
+EXPORT_SYMBOL(bdev_dax_pgoff);
+
+#if IS_ENABLED(CONFIG_FS_DAX)
/**
* dax_get_by_host() - temporary lookup mechanism for filesystem-dax
* @host: alternate name for the device registered by a dax driver
@@ -94,24 +112,6 @@ static struct dax_device *dax_get_by_host(const char *host)
return found;
}

-#ifdef CONFIG_BLOCK
-#include <linux/blkdev.h>
-
-int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
- pgoff_t *pgoff)
-{
- sector_t start_sect = bdev ? get_start_sect(bdev) : 0;
- phys_addr_t phys_off = (start_sect + sector) * 512;
-
- if (pgoff)
- *pgoff = PHYS_PFN(phys_off);
- if (phys_off % PAGE_SIZE || size % PAGE_SIZE)
- return -EINVAL;
- return 0;
-}
-EXPORT_SYMBOL(bdev_dax_pgoff);
-
-#if IS_ENABLED(CONFIG_FS_DAX)
struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
{
if (!blk_queue_dax(bdev->bd_disk->queue))
@@ -231,70 +231,6 @@ enum dax_device_flags {
DAXDEV_SYNC,
};

-static ssize_t write_cache_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct dax_device *dax_dev = dax_get_by_host(dev_name(dev));
- ssize_t rc;
-
- WARN_ON_ONCE(!dax_dev);
- if (!dax_dev)
- return -ENXIO;
-
- rc = sprintf(buf, "%d\n", !!dax_write_cache_enabled(dax_dev));
- put_dax(dax_dev);
- return rc;
-}
-
-static ssize_t write_cache_store(struct device *dev,
- struct device_attribute *attr, const char *buf, size_t len)
-{
- bool write_cache;
- int rc = strtobool(buf, &write_cache);
- struct dax_device *dax_dev = dax_get_by_host(dev_name(dev));
-
- WARN_ON_ONCE(!dax_dev);
- if (!dax_dev)
- return -ENXIO;
-
- if (rc)
- len = rc;
- else
- dax_write_cache(dax_dev, write_cache);
-
- put_dax(dax_dev);
- return len;
-}
-static DEVICE_ATTR_RW(write_cache);
-
-static umode_t dax_visible(struct kobject *kobj, struct attribute *a, int n)
-{
- struct device *dev = container_of(kobj, typeof(*dev), kobj);
- struct dax_device *dax_dev = dax_get_by_host(dev_name(dev));
-
- WARN_ON_ONCE(!dax_dev);
- if (!dax_dev)
- return 0;
-
-#ifndef CONFIG_ARCH_HAS_PMEM_API
- if (a == &dev_attr_write_cache.attr)
- return 0;
-#endif
- return a->mode;
-}
-
-static struct attribute *dax_attributes[] = {
- &dev_attr_write_cache.attr,
- NULL,
-};
-
-struct attribute_group dax_attribute_group = {
- .name = "dax",
- .attrs = dax_attributes,
- .is_visible = dax_visible,
-};
-EXPORT_SYMBOL_GPL(dax_attribute_group);
-
/**
* dax_direct_access() - translate a device pgoff to an absolute pfn
* @dax_dev: a dax_device instance representing the logical memory range
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index c74d7bceb2224..9cc0d0ebfad16 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -327,6 +327,49 @@ static const struct dax_operations pmem_dax_ops = {
.zero_page_range = pmem_dax_zero_page_range,
};

+static ssize_t write_cache_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct pmem_device *pmem = dev_to_disk(dev)->private_data;
+
+ return sprintf(buf, "%d\n", !!dax_write_cache_enabled(pmem->dax_dev));
+}
+
+static ssize_t write_cache_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct pmem_device *pmem = dev_to_disk(dev)->private_data;
+ bool write_cache;
+ int rc;
+
+ rc = strtobool(buf, &write_cache);
+ if (rc)
+ return rc;
+ dax_write_cache(pmem->dax_dev, write_cache);
+ return len;
+}
+static DEVICE_ATTR_RW(write_cache);
+
+static umode_t dax_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+#ifndef CONFIG_ARCH_HAS_PMEM_API
+ if (a == &dev_attr_write_cache.attr)
+ return 0;
+#endif
+ return a->mode;
+}
+
+static struct attribute *dax_attributes[] = {
+ &dev_attr_write_cache.attr,
+ NULL,
+};
+
+static const struct attribute_group dax_attribute_group = {
+ .name = "dax",
+ .attrs = dax_attributes,
+ .is_visible = dax_visible,
+};
+
static const struct attribute_group *pmem_attribute_groups[] = {
&dax_attribute_group,
NULL,
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 2619d94c308d4..8623caa673889 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -38,8 +38,6 @@ struct dax_operations {
int (*zero_page_range)(struct dax_device *, pgoff_t, size_t);
};

-extern struct attribute_group dax_attribute_group;
-
#if IS_ENABLED(CONFIG_DAX)
struct dax_device *alloc_dax(void *private, const char *host,
const struct dax_operations *ops, unsigned long flags);
--
2.30.2

2021-11-09 15:40:32

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 04/29] dax: simplify the dax_device <-> gendisk association

Replace the dax_host_hash with an xarray indexed by the pointer value
of the gendisk, and require explicit calls from the block drivers that
want to associate their gendisk with a dax_device.

Signed-off-by: Christoph Hellwig <[email protected]>
Acked-by: Mike Snitzer <[email protected]>
---
drivers/dax/bus.c | 6 +-
drivers/dax/super.c | 106 +++++++++--------------------------
drivers/md/dm.c | 6 +-
drivers/nvdimm/pmem.c | 8 ++-
drivers/s390/block/dcssblk.c | 11 +++-
fs/fuse/virtio_fs.c | 2 +-
include/linux/dax.h | 19 +++++--
7 files changed, 62 insertions(+), 96 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 6cc4da4c713d9..bd7af2f7c5b0a 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -1323,10 +1323,10 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data)
}

/*
- * No 'host' or dax_operations since there is no access to this
- * device outside of mmap of the resulting character device.
+ * No dax_operations since there is no access to this device outside of
+ * mmap of the resulting character device.
*/
- dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC);
+ dax_dev = alloc_dax(dev_dax, NULL, DAXDEV_F_SYNC);
if (IS_ERR(dax_dev)) {
rc = PTR_ERR(dax_dev);
goto err_alloc_dax;
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index e20d0cef10a18..9383c11b21853 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -7,10 +7,8 @@
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/magic.h>
-#include <linux/genhd.h>
#include <linux/pfn_t.h>
#include <linux/cdev.h>
-#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/uio.h>
#include <linux/dax.h>
@@ -26,10 +24,8 @@
* @flags: state and boolean properties
*/
struct dax_device {
- struct hlist_node list;
struct inode inode;
struct cdev cdev;
- const char *host;
void *private;
unsigned long flags;
const struct dax_operations *ops;
@@ -42,10 +38,6 @@ static DEFINE_IDA(dax_minor_ida);
static struct kmem_cache *dax_cache __read_mostly;
static struct super_block *dax_superblock __read_mostly;

-#define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head))
-static struct hlist_head dax_host_list[DAX_HASH_SIZE];
-static DEFINE_SPINLOCK(dax_host_lock);
-
int dax_read_lock(void)
{
return srcu_read_lock(&dax_srcu);
@@ -58,13 +50,22 @@ void dax_read_unlock(int id)
}
EXPORT_SYMBOL_GPL(dax_read_unlock);

-static int dax_host_hash(const char *host)
+#if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
+#include <linux/blkdev.h>
+
+static DEFINE_XARRAY(dax_hosts);
+
+int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
{
- return hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE;
+ return xa_insert(&dax_hosts, (unsigned long)disk, dax_dev, GFP_KERNEL);
}
+EXPORT_SYMBOL_GPL(dax_add_host);

-#if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
-#include <linux/blkdev.h>
+void dax_remove_host(struct gendisk *disk)
+{
+ xa_erase(&dax_hosts, (unsigned long)disk);
+}
+EXPORT_SYMBOL_GPL(dax_remove_host);

int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
pgoff_t *pgoff)
@@ -82,40 +83,23 @@ EXPORT_SYMBOL(bdev_dax_pgoff);

/**
* dax_get_by_host() - temporary lookup mechanism for filesystem-dax
- * @host: alternate name for the device registered by a dax driver
+ * @bdev: block device to find a dax_device for
*/
-static struct dax_device *dax_get_by_host(const char *host)
+struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
{
- struct dax_device *dax_dev, *found = NULL;
- int hash, id;
+ struct dax_device *dax_dev;
+ int id;

- if (!host)
+ if (!blk_queue_dax(bdev->bd_disk->queue))
return NULL;

- hash = dax_host_hash(host);
-
id = dax_read_lock();
- spin_lock(&dax_host_lock);
- hlist_for_each_entry(dax_dev, &dax_host_list[hash], list) {
- if (!dax_alive(dax_dev)
- || strcmp(host, dax_dev->host) != 0)
- continue;
-
- if (igrab(&dax_dev->inode))
- found = dax_dev;
- break;
- }
- spin_unlock(&dax_host_lock);
+ dax_dev = xa_load(&dax_hosts, (unsigned long)bdev->bd_disk);
+ if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode))
+ dax_dev = NULL;
dax_read_unlock(id);

- return found;
-}
-
-struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
-{
- if (!blk_queue_dax(bdev->bd_disk->queue))
- return NULL;
- return dax_get_by_host(bdev->bd_disk->disk_name);
+ return dax_dev;
}
EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);

@@ -361,12 +345,7 @@ void kill_dax(struct dax_device *dax_dev)
return;

clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
-
synchronize_srcu(&dax_srcu);
-
- spin_lock(&dax_host_lock);
- hlist_del_init(&dax_dev->list);
- spin_unlock(&dax_host_lock);
}
EXPORT_SYMBOL_GPL(kill_dax);

@@ -398,8 +377,6 @@ static struct dax_device *to_dax_dev(struct inode *inode)
static void dax_free_inode(struct inode *inode)
{
struct dax_device *dax_dev = to_dax_dev(inode);
- kfree(dax_dev->host);
- dax_dev->host = NULL;
if (inode->i_rdev)
ida_simple_remove(&dax_minor_ida, iminor(inode));
kmem_cache_free(dax_cache, dax_dev);
@@ -474,54 +451,25 @@ static struct dax_device *dax_dev_get(dev_t devt)
return dax_dev;
}

-static void dax_add_host(struct dax_device *dax_dev, const char *host)
-{
- int hash;
-
- /*
- * Unconditionally init dax_dev since it's coming from a
- * non-zeroed slab cache
- */
- INIT_HLIST_NODE(&dax_dev->list);
- dax_dev->host = host;
- if (!host)
- return;
-
- hash = dax_host_hash(host);
- spin_lock(&dax_host_lock);
- hlist_add_head(&dax_dev->list, &dax_host_list[hash]);
- spin_unlock(&dax_host_lock);
-}
-
-struct dax_device *alloc_dax(void *private, const char *__host,
- const struct dax_operations *ops, unsigned long flags)
+struct dax_device *alloc_dax(void *private, const struct dax_operations *ops,
+ unsigned long flags)
{
struct dax_device *dax_dev;
- const char *host;
dev_t devt;
int minor;

- if (ops && !ops->zero_page_range) {
- pr_debug("%s: error: device does not provide dax"
- " operation zero_page_range()\n",
- __host ? __host : "Unknown");
+ if (WARN_ON_ONCE(ops && !ops->zero_page_range))
return ERR_PTR(-EINVAL);
- }
-
- host = kstrdup(__host, GFP_KERNEL);
- if (__host && !host)
- return ERR_PTR(-ENOMEM);

minor = ida_simple_get(&dax_minor_ida, 0, MINORMASK+1, GFP_KERNEL);
if (minor < 0)
- goto err_minor;
+ return ERR_PTR(-ENOMEM);

devt = MKDEV(MAJOR(dax_devt), minor);
dax_dev = dax_dev_get(devt);
if (!dax_dev)
goto err_dev;

- dax_add_host(dax_dev, host);
dax_dev->ops = ops;
dax_dev->private = private;
if (flags & DAXDEV_F_SYNC)
@@ -531,8 +479,6 @@ struct dax_device *alloc_dax(void *private, const char *__host,

err_dev:
ida_simple_remove(&dax_minor_ida, minor);
- err_minor:
- kfree(host);
return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL_GPL(alloc_dax);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 893fca738a3d8..782a076f61f81 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1683,6 +1683,7 @@ static void cleanup_mapped_device(struct mapped_device *md)
bioset_exit(&md->io_bs);

if (md->dax_dev) {
+ dax_remove_host(md->disk);
kill_dax(md->dax_dev);
put_dax(md->dax_dev);
md->dax_dev = NULL;
@@ -1784,10 +1785,11 @@ static struct mapped_device *alloc_dev(int minor)
sprintf(md->disk->disk_name, "dm-%d", minor);

if (IS_ENABLED(CONFIG_FS_DAX)) {
- md->dax_dev = alloc_dax(md, md->disk->disk_name,
- &dm_dax_ops, 0);
+ md->dax_dev = alloc_dax(md, &dm_dax_ops, 0);
if (IS_ERR(md->dax_dev))
goto bad;
+ if (dax_add_host(md->dax_dev, md->disk))
+ goto bad;
}

format_dev_t(md->name, MKDEV(_major, minor));
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 9cc0d0ebfad16..8783ad7370856 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -379,6 +379,7 @@ static void pmem_release_disk(void *__pmem)
{
struct pmem_device *pmem = __pmem;

+ dax_remove_host(pmem->disk);
kill_dax(pmem->dax_dev);
put_dax(pmem->dax_dev);
del_gendisk(pmem->disk);
@@ -495,10 +496,11 @@ static int pmem_attach_disk(struct device *dev,

if (is_nvdimm_sync(nd_region))
flags = DAXDEV_F_SYNC;
- dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops, flags);
- if (IS_ERR(dax_dev)) {
+ dax_dev = alloc_dax(pmem, &pmem_dax_ops, flags);
+ if (IS_ERR(dax_dev))
return PTR_ERR(dax_dev);
- }
+ if (dax_add_host(dax_dev, disk))
+ return -ENOMEM;
dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
pmem->dax_dev = dax_dev;

diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 27ab888b44d0a..657e492f2bc26 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -687,18 +687,21 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
if (rc)
goto put_dev;

- dev_info->dax_dev = alloc_dax(dev_info, dev_info->gd->disk_name,
- &dcssblk_dax_ops, DAXDEV_F_SYNC);
+ dev_info->dax_dev = alloc_dax(dev_info, &dcssblk_dax_ops,
+ DAXDEV_F_SYNC);
if (IS_ERR(dev_info->dax_dev)) {
rc = PTR_ERR(dev_info->dax_dev);
dev_info->dax_dev = NULL;
goto put_dev;
}
+ rc = dax_add_host(dev_info->dax_dev, dev_info->gd);
+ if (rc)
+ goto out_dax;

get_device(&dev_info->dev);
rc = device_add_disk(&dev_info->dev, dev_info->gd, NULL);
if (rc)
- goto out_dax;
+ goto out_dax_host;

switch (dev_info->segment_type) {
case SEG_TYPE_SR:
@@ -714,6 +717,8 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
rc = count;
goto out;

+out_dax_host:
+ dax_remove_host(dev_info->gd);
out_dax:
put_device(&dev_info->dev);
kill_dax(dev_info->dax_dev);
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 94fc874f5de7f..b4c7c7fa987f8 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -850,7 +850,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n",
__func__, fs->window_kaddr, cache_reg.addr, cache_reg.len);

- fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops, 0);
+ fs->dax_dev = alloc_dax(fs, &virtio_fs_dax_ops, 0);
if (IS_ERR(fs->dax_dev))
return PTR_ERR(fs->dax_dev);

diff --git a/include/linux/dax.h b/include/linux/dax.h
index 8623caa673889..e2e9a67004cbd 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -11,9 +11,11 @@

typedef unsigned long dax_entry_t;

+struct dax_device;
+struct gendisk;
struct iomap_ops;
struct iomap;
-struct dax_device;
+
struct dax_operations {
/*
* direct_access: translate a device-relative
@@ -39,8 +41,8 @@ struct dax_operations {
};

#if IS_ENABLED(CONFIG_DAX)
-struct dax_device *alloc_dax(void *private, const char *host,
- const struct dax_operations *ops, unsigned long flags);
+struct dax_device *alloc_dax(void *private, const struct dax_operations *ops,
+ unsigned long flags);
void put_dax(struct dax_device *dax_dev);
void kill_dax(struct dax_device *dax_dev);
void dax_write_cache(struct dax_device *dax_dev, bool wc);
@@ -68,7 +70,7 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
return dax_synchronous(dax_dev);
}
#else
-static inline struct dax_device *alloc_dax(void *private, const char *host,
+static inline struct dax_device *alloc_dax(void *private,
const struct dax_operations *ops, unsigned long flags)
{
/*
@@ -107,6 +109,8 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
struct writeback_control;
int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff);
#if IS_ENABLED(CONFIG_FS_DAX)
+int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
+void dax_remove_host(struct gendisk *disk);
bool generic_fsdax_supported(struct dax_device *dax_dev,
struct block_device *bdev, int blocksize, sector_t start,
sector_t sectors);
@@ -128,6 +132,13 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t st
dax_entry_t dax_lock_page(struct page *page);
void dax_unlock_page(struct page *page, dax_entry_t cookie);
#else
+static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
+{
+ return 0;
+}
+static inline void dax_remove_host(struct gendisk *disk)
+{
+}
#define generic_fsdax_supported NULL

static inline bool dax_supported(struct dax_device *dax_dev,
--
2.30.2

2021-11-09 15:41:59

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 02/29] dm: make the DAX support dependent on CONFIG_FS_DAX

The device mapper DAX support is all hanging off a block device and thus
can't be used with device dax. Make it depend on CONFIG_FS_DAX instead
of CONFIG_DAX_DRIVER. This also means that bdev_dax_pgoff only needs to
be built under CONFIG_FS_DAX now.

Signed-off-by: Christoph Hellwig <[email protected]>
---
drivers/dax/super.c | 6 ++----
drivers/md/dm-linear.c | 2 +-
drivers/md/dm-log-writes.c | 2 +-
drivers/md/dm-stripe.c | 2 +-
drivers/md/dm-writecache.c | 2 +-
drivers/md/dm.c | 2 +-
6 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index b882cf8106ea3..e20d0cef10a18 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -63,7 +63,7 @@ static int dax_host_hash(const char *host)
return hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE;
}

-#ifdef CONFIG_BLOCK
+#if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
#include <linux/blkdev.h>

int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
@@ -80,7 +80,6 @@ int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
}
EXPORT_SYMBOL(bdev_dax_pgoff);

-#if IS_ENABLED(CONFIG_FS_DAX)
/**
* dax_get_by_host() - temporary lookup mechanism for filesystem-dax
* @host: alternate name for the device registered by a dax driver
@@ -219,8 +218,7 @@ bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
return ret;
}
EXPORT_SYMBOL_GPL(dax_supported);
-#endif /* CONFIG_FS_DAX */
-#endif /* CONFIG_BLOCK */
+#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */

enum dax_device_flags {
/* !alive + rcu grace period == no new operations / mappings */
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 66ba16713f696..0a260c35aeeed 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -162,7 +162,7 @@ static int linear_iterate_devices(struct dm_target *ti,
return fn(ti, lc->dev, lc->start, ti->len, data);
}

-#if IS_ENABLED(CONFIG_DAX_DRIVER)
+#if IS_ENABLED(CONFIG_FS_DAX)
static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
long nr_pages, void **kaddr, pfn_t *pfn)
{
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 46de085a96709..524bc536922eb 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -903,7 +903,7 @@ static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limit
limits->io_min = limits->physical_block_size;
}

-#if IS_ENABLED(CONFIG_DAX_DRIVER)
+#if IS_ENABLED(CONFIG_FS_DAX)
static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes,
struct iov_iter *i)
{
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 6660b6b53d5bf..f084607220293 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -300,7 +300,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_REMAPPED;
}

-#if IS_ENABLED(CONFIG_DAX_DRIVER)
+#if IS_ENABLED(CONFIG_FS_DAX)
static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
long nr_pages, void **kaddr, pfn_t *pfn)
{
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index 017806096b91e..0af464a863fe6 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -38,7 +38,7 @@
#define BITMAP_GRANULARITY PAGE_SIZE
#endif

-#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_DAX_DRIVER)
+#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_FS_DAX)
#define DM_WRITECACHE_HAS_PMEM
#endif

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 63aa522636585..893fca738a3d8 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1783,7 +1783,7 @@ static struct mapped_device *alloc_dev(int minor)
md->disk->private_data = md;
sprintf(md->disk->disk_name, "dm-%d", minor);

- if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
+ if (IS_ENABLED(CONFIG_FS_DAX)) {
md->dax_dev = alloc_dax(md, md->disk->disk_name,
&dm_dax_ops, 0);
if (IS_ERR(md->dax_dev))
--
2.30.2

2021-11-09 15:42:01

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 06/29] dax: move the partition alignment check into fs_dax_get_by_bdev

fs_dax_get_by_bdev is the primary interface to find a dax device for a
block device, so move the partition alignment check there instead of
wiring it up through ->dax_supported.

Signed-off-by: Christoph Hellwig <[email protected]>
---
drivers/dax/super.c | 23 ++++++-----------------
1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 04fc680542e8d..482fe775324a4 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -93,6 +93,12 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
if (!blk_queue_dax(bdev->bd_disk->queue))
return NULL;

+ if ((get_start_sect(bdev) * SECTOR_SIZE) % PAGE_SIZE ||
+ (bdev_nr_sectors(bdev) * SECTOR_SIZE) % PAGE_SIZE) {
+ pr_info("%pg: error: unaligned partition for dax\n", bdev);
+ return NULL;
+ }
+
id = dax_read_lock();
dax_dev = xa_load(&dax_hosts, (unsigned long)bdev->bd_disk);
if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode))
@@ -107,10 +113,6 @@ bool generic_fsdax_supported(struct dax_device *dax_dev,
struct block_device *bdev, int blocksize, sector_t start,
sector_t sectors)
{
- pgoff_t pgoff, pgoff_end;
- sector_t last_page;
- int err;
-
if (blocksize != PAGE_SIZE) {
pr_info("%pg: error: unsupported blocksize for dax\n", bdev);
return false;
@@ -121,19 +123,6 @@ bool generic_fsdax_supported(struct dax_device *dax_dev,
return false;
}

- err = bdev_dax_pgoff(bdev, start, PAGE_SIZE, &pgoff);
- if (err) {
- pr_info("%pg: error: unaligned partition for dax\n", bdev);
- return false;
- }
-
- last_page = PFN_DOWN((start + sectors - 1) * 512) * PAGE_SIZE / 512;
- err = bdev_dax_pgoff(bdev, last_page, PAGE_SIZE, &pgoff_end);
- if (err) {
- pr_info("%pg: error: unaligned partition for dax\n", bdev);
- return false;
- }
-
return true;
}
EXPORT_SYMBOL_GPL(generic_fsdax_supported);
--
2.30.2

2021-11-09 15:42:02

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 05/29] dax: remove the pgmap sanity checks in generic_fsdax_supported

Drivers that register a dax_dev should make sure it works; there is no
need to double check from the file system.

Signed-off-by: Christoph Hellwig <[email protected]>
---
drivers/dax/super.c | 49 +--------------------------------------------
1 file changed, 1 insertion(+), 48 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 9383c11b21853..04fc680542e8d 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -107,13 +107,9 @@ bool generic_fsdax_supported(struct dax_device *dax_dev,
struct block_device *bdev, int blocksize, sector_t start,
sector_t sectors)
{
- bool dax_enabled = false;
pgoff_t pgoff, pgoff_end;
- void *kaddr, *end_kaddr;
- pfn_t pfn, end_pfn;
sector_t last_page;
- long len, len2;
- int err, id;
+ int err;

if (blocksize != PAGE_SIZE) {
pr_info("%pg: error: unsupported blocksize for dax\n", bdev);
@@ -138,49 +134,6 @@ bool generic_fsdax_supported(struct dax_device *dax_dev,
return false;
}

- id = dax_read_lock();
- len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
- len2 = dax_direct_access(dax_dev, pgoff_end, 1, &end_kaddr, &end_pfn);
-
- if (len < 1 || len2 < 1) {
- pr_info("%pg: error: dax access failed (%ld)\n",
- bdev, len < 1 ? len : len2);
- dax_read_unlock(id);
- return false;
- }
-
- if (IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn)) {
- /*
- * An arch that has enabled the pmem api should also
- * have its drivers support pfn_t_devmap()
- *
- * This is a developer warning and should not trigger in
- * production. dax_flush() will crash since it depends
- * on being able to do (page_address(pfn_to_page())).
- */
- WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API));
- dax_enabled = true;
- } else if (pfn_t_devmap(pfn) && pfn_t_devmap(end_pfn)) {
- struct dev_pagemap *pgmap, *end_pgmap;
-
- pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL);
- end_pgmap = get_dev_pagemap(pfn_t_to_pfn(end_pfn), NULL);
- if (pgmap && pgmap == end_pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX
- && pfn_t_to_page(pfn)->pgmap == pgmap
- && pfn_t_to_page(end_pfn)->pgmap == pgmap
- && pfn_t_to_pfn(pfn) == PHYS_PFN(__pa(kaddr))
- && pfn_t_to_pfn(end_pfn) == PHYS_PFN(__pa(end_kaddr)))
- dax_enabled = true;
- put_dev_pagemap(pgmap);
- put_dev_pagemap(end_pgmap);
-
- }
- dax_read_unlock(id);
-
- if (!dax_enabled) {
- pr_info("%pg: error: dax support not enabled\n", bdev);
- return false;
- }
return true;
}
EXPORT_SYMBOL_GPL(generic_fsdax_supported);
--
2.30.2

2021-11-09 15:42:17

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 08/29] dax: remove dax_capable

Just open code the block size and dax_dev == NULL checks in the callers.

Signed-off-by: Christoph Hellwig <[email protected]>
Acked-by: Mike Snitzer <[email protected]>
---
drivers/dax/super.c | 36 ------------------------------------
drivers/md/dm-table.c | 22 +++++++++++-----------
drivers/md/dm.c | 21 ---------------------
drivers/md/dm.h | 4 ----
drivers/nvdimm/pmem.c | 1 -
drivers/s390/block/dcssblk.c | 1 -
fs/erofs/super.c | 11 +++++++----
fs/ext2/super.c | 6 ++++--
fs/ext4/super.c | 9 ++++++---
fs/xfs/xfs_super.c | 21 ++++++++-------------
include/linux/dax.h | 14 --------------
11 files changed, 36 insertions(+), 110 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 482fe775324a4..803942586d1b6 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -108,42 +108,6 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
return dax_dev;
}
EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
-
-bool generic_fsdax_supported(struct dax_device *dax_dev,
- struct block_device *bdev, int blocksize, sector_t start,
- sector_t sectors)
-{
- if (blocksize != PAGE_SIZE) {
- pr_info("%pg: error: unsupported blocksize for dax\n", bdev);
- return false;
- }
-
- if (!dax_dev) {
- pr_debug("%pg: error: dax unsupported by block device\n", bdev);
- return false;
- }
-
- return true;
-}
-EXPORT_SYMBOL_GPL(generic_fsdax_supported);
-
-bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
- int blocksize, sector_t start, sector_t len)
-{
- bool ret = false;
- int id;
-
- if (!dax_dev)
- return false;
-
- id = dax_read_lock();
- if (dax_alive(dax_dev) && dax_dev->ops->dax_supported)
- ret = dax_dev->ops->dax_supported(dax_dev, bdev, blocksize,
- start, len);
- dax_read_unlock(id);
- return ret;
-}
-EXPORT_SYMBOL_GPL(dax_supported);
#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */

enum dax_device_flags {
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index bcddc5effd155..f4915a7d5dc84 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -806,12 +806,14 @@ void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type)
EXPORT_SYMBOL_GPL(dm_table_set_type);

/* validate the dax capability of the target device span */
-int device_not_dax_capable(struct dm_target *ti, struct dm_dev *dev,
+static int device_not_dax_capable(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
- int blocksize = *(int *) data;
+ if (dev->dax_dev)
+ return false;

- return !dax_supported(dev->dax_dev, dev->bdev, blocksize, start, len);
+ DMDEBUG("%pg: error: dax unsupported by block device", dev->bdev);
+ return true;
}

/* Check devices support synchronous DAX */
@@ -821,8 +823,8 @@ static int device_not_dax_synchronous_capable(struct dm_target *ti, struct dm_de
return !dev->dax_dev || !dax_synchronous(dev->dax_dev);
}

-bool dm_table_supports_dax(struct dm_table *t,
- iterate_devices_callout_fn iterate_fn, int *blocksize)
+static bool dm_table_supports_dax(struct dm_table *t,
+ iterate_devices_callout_fn iterate_fn)
{
struct dm_target *ti;
unsigned i;
@@ -835,7 +837,7 @@ bool dm_table_supports_dax(struct dm_table *t,
return false;

if (!ti->type->iterate_devices ||
- ti->type->iterate_devices(ti, iterate_fn, blocksize))
+ ti->type->iterate_devices(ti, iterate_fn, NULL))
return false;
}

@@ -862,7 +864,6 @@ static int dm_table_determine_type(struct dm_table *t)
struct dm_target *tgt;
struct list_head *devices = dm_table_get_devices(t);
enum dm_queue_mode live_md_type = dm_get_md_type(t->md);
- int page_size = PAGE_SIZE;

if (t->type != DM_TYPE_NONE) {
/* target already set the table's type */
@@ -906,7 +907,7 @@ static int dm_table_determine_type(struct dm_table *t)
verify_bio_based:
/* We must use this table as bio-based */
t->type = DM_TYPE_BIO_BASED;
- if (dm_table_supports_dax(t, device_not_dax_capable, &page_size) ||
+ if (dm_table_supports_dax(t, device_not_dax_capable) ||
(list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) {
t->type = DM_TYPE_DAX_BIO_BASED;
}
@@ -1976,7 +1977,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *limits)
{
bool wc = false, fua = false;
- int page_size = PAGE_SIZE;
int r;

/*
@@ -2010,9 +2010,9 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
}
blk_queue_write_cache(q, wc, fua);

- if (dm_table_supports_dax(t, device_not_dax_capable, &page_size)) {
+ if (dm_table_supports_dax(t, device_not_dax_capable)) {
blk_queue_flag_set(QUEUE_FLAG_DAX, q);
- if (dm_table_supports_dax(t, device_not_dax_synchronous_capable, NULL))
+ if (dm_table_supports_dax(t, device_not_dax_synchronous_capable))
set_dax_synchronous(t->md->dax_dev);
}
else
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 782a076f61f81..282008afc465f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1027,26 +1027,6 @@ static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
return ret;
}

-static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
- int blocksize, sector_t start, sector_t len)
-{
- struct mapped_device *md = dax_get_private(dax_dev);
- struct dm_table *map;
- bool ret = false;
- int srcu_idx;
-
- map = dm_get_live_table(md, &srcu_idx);
- if (!map)
- goto out;
-
- ret = dm_table_supports_dax(map, device_not_dax_capable, &blocksize);
-
-out:
- dm_put_live_table(md, srcu_idx);
-
- return ret;
-}
-
static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
{
@@ -3052,7 +3032,6 @@ static const struct block_device_operations dm_rq_blk_dops = {

static const struct dax_operations dm_dax_ops = {
.direct_access = dm_dax_direct_access,
- .dax_supported = dm_dax_supported,
.copy_from_iter = dm_dax_copy_from_iter,
.copy_to_iter = dm_dax_copy_to_iter,
.zero_page_range = dm_dax_zero_page_range,
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 742d9c80efe19..9013dc1a7b002 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -73,10 +73,6 @@ bool dm_table_bio_based(struct dm_table *t);
bool dm_table_request_based(struct dm_table *t);
void dm_table_free_md_mempools(struct dm_table *t);
struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
-bool dm_table_supports_dax(struct dm_table *t, iterate_devices_callout_fn fn,
- int *blocksize);
-int device_not_dax_capable(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data);

void dm_lock_md_type(struct mapped_device *md);
void dm_unlock_md_type(struct mapped_device *md);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 8783ad7370856..0d66339875523 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -321,7 +321,6 @@ static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,

static const struct dax_operations pmem_dax_ops = {
.direct_access = pmem_dax_direct_access,
- .dax_supported = generic_fsdax_supported,
.copy_from_iter = pmem_copy_from_iter,
.copy_to_iter = pmem_copy_to_iter,
.zero_page_range = pmem_dax_zero_page_range,
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 657e492f2bc26..e65e83764d1ce 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -72,7 +72,6 @@ static int dcssblk_dax_zero_page_range(struct dax_device *dax_dev,

static const struct dax_operations dcssblk_dax_ops = {
.direct_access = dcssblk_dax_direct_access,
- .dax_supported = generic_fsdax_supported,
.copy_from_iter = dcssblk_dax_copy_from_iter,
.copy_to_iter = dcssblk_dax_copy_to_iter,
.zero_page_range = dcssblk_dax_zero_page_range,
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 6a969b1e0ee6b..0aed886473c8d 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -652,10 +652,13 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
if (err)
return err;

- if (test_opt(&sbi->opt, DAX_ALWAYS) &&
- !dax_supported(sbi->dax_dev, sb->s_bdev, EROFS_BLKSIZ, 0, bdev_nr_sectors(sb->s_bdev))) {
- errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
- clear_opt(&sbi->opt, DAX_ALWAYS);
+ if (test_opt(&sbi->opt, DAX_ALWAYS)) {
+ BUILD_BUG_ON(EROFS_BLKSIZ != PAGE_SIZE);
+
+ if (!sbi->dax_dev) {
+ errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
+ clear_opt(&sbi->opt, DAX_ALWAYS);
+ }
}
sb->s_flags |= SB_RDONLY | SB_NOATIME;
sb->s_maxbytes = MAX_LFS_FILESIZE;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index d8d580b609baa..a964066a80aa7 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -946,11 +946,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);

if (test_opt(sb, DAX)) {
- if (!dax_supported(dax_dev, sb->s_bdev, blocksize, 0,
- bdev_nr_sectors(sb->s_bdev))) {
+ if (!dax_dev) {
ext2_msg(sb, KERN_ERR,
"DAX unsupported by block device. Turning off DAX.");
clear_opt(sbi->s_mount_opt, DAX);
+ } else if (blocksize != PAGE_SIZE) {
+ ext2_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n");
+ clear_opt(sbi->s_mount_opt, DAX);
}
}

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a320c54202d95..eb4df43abd76e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4300,9 +4300,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}

- if (dax_supported(dax_dev, sb->s_bdev, blocksize, 0,
- bdev_nr_sectors(sb->s_bdev)))
- set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
+ if (dax_dev) {
+ if (blocksize == PAGE_SIZE)
+ set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
+ else
+ ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n");
+ }

if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) {
if (ext4_has_feature_inline_data(sb)) {
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 875fd3151d6c9..3a45d5caa28d5 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -331,28 +331,23 @@ xfs_set_inode_alloc(
return xfs_is_inode32(mp) ? maxagi : agcount;
}

-static bool
-xfs_buftarg_is_dax(
- struct super_block *sb,
- struct xfs_buftarg *bt)
-{
- return dax_supported(bt->bt_daxdev, bt->bt_bdev, sb->s_blocksize, 0,
- bdev_nr_sectors(bt->bt_bdev));
-}
-
static int
xfs_setup_dax_always(
struct xfs_mount *mp)
{
- struct super_block *sb = mp->m_super;
-
- if (!xfs_buftarg_is_dax(sb, mp->m_ddev_targp) &&
- (!mp->m_rtdev_targp || !xfs_buftarg_is_dax(sb, mp->m_rtdev_targp))) {
+ if (!mp->m_ddev_targp->bt_daxdev &&
+ (!mp->m_rtdev_targp || !mp->m_rtdev_targp->bt_daxdev)) {
xfs_alert(mp,
"DAX unsupported by block device. Turning off DAX.");
goto disable_dax;
}

+ if (mp->m_super->s_blocksize != PAGE_SIZE) {
+ xfs_alert(mp,
+ "DAX not supported for blocksize. Turning off DAX.\n");
+ goto disable_dax;
+ }
+
if (xfs_has_reflink(mp)) {
xfs_alert(mp, "DAX and reflink cannot be used together!");
return -EINVAL;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index e2e9a67004cbd..439c3c70e347b 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -111,12 +111,6 @@ int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff);
#if IS_ENABLED(CONFIG_FS_DAX)
int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
void dax_remove_host(struct gendisk *disk);
-bool generic_fsdax_supported(struct dax_device *dax_dev,
- struct block_device *bdev, int blocksize, sector_t start,
- sector_t sectors);
-
-bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
- int blocksize, sector_t start, sector_t len);

static inline void fs_put_dax(struct dax_device *dax_dev)
{
@@ -139,14 +133,6 @@ static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
static inline void dax_remove_host(struct gendisk *disk)
{
}
-#define generic_fsdax_supported NULL
-
-static inline bool dax_supported(struct dax_device *dax_dev,
- struct block_device *bdev, int blocksize, sector_t start,
- sector_t len)
-{
- return false;
-}

static inline void fs_put_dax(struct dax_device *dax_dev)
{
--
2.30.2

2021-11-09 15:42:25

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 07/29] xfs: factor out a xfs_setup_dax_always helper

Factor out another DAX setup helper to simplify future changes. Also
move the experimental warning after the checks so that it does not
clutter the log if the setup fails.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/xfs/xfs_super.c | 47 +++++++++++++++++++++++++++-------------------
1 file changed, 28 insertions(+), 19 deletions(-)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index e21459f9923a8..875fd3151d6c9 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -340,6 +340,32 @@ xfs_buftarg_is_dax(
bdev_nr_sectors(bt->bt_bdev));
}

+static int
+xfs_setup_dax_always(
+ struct xfs_mount *mp)
+{
+ struct super_block *sb = mp->m_super;
+
+ if (!xfs_buftarg_is_dax(sb, mp->m_ddev_targp) &&
+ (!mp->m_rtdev_targp || !xfs_buftarg_is_dax(sb, mp->m_rtdev_targp))) {
+ xfs_alert(mp,
+ "DAX unsupported by block device. Turning off DAX.");
+ goto disable_dax;
+ }
+
+ if (xfs_has_reflink(mp)) {
+ xfs_alert(mp, "DAX and reflink cannot be used together!");
+ return -EINVAL;
+ }
+
+ xfs_warn(mp, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+ return 0;
+
+disable_dax:
+ xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER);
+ return 0;
+}
+
STATIC int
xfs_blkdev_get(
xfs_mount_t *mp,
@@ -1593,26 +1619,9 @@ xfs_fs_fill_super(
sb->s_flags |= SB_I_VERSION;

if (xfs_has_dax_always(mp)) {
- bool rtdev_is_dax = false, datadev_is_dax;
-
- xfs_warn(mp,
- "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
-
- datadev_is_dax = xfs_buftarg_is_dax(sb, mp->m_ddev_targp);
- if (mp->m_rtdev_targp)
- rtdev_is_dax = xfs_buftarg_is_dax(sb,
- mp->m_rtdev_targp);
- if (!rtdev_is_dax && !datadev_is_dax) {
- xfs_alert(mp,
- "DAX unsupported by block device. Turning off DAX.");
- xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER);
- }
- if (xfs_has_reflink(mp)) {
- xfs_alert(mp,
- "DAX and reflink cannot be used together!");
- error = -EINVAL;
+ error = xfs_setup_dax_always(mp);
+ if (error)
goto out_filestream_unmount;
- }
}

if (xfs_has_discard(mp)) {
--
2.30.2

2021-11-09 15:43:37

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 09/29] dm-linear: add a linear_dax_pgoff helper

Add a helper to perform the entire remapping for DAX accesses. This
helper open codes bdev_dax_pgoff given that the alignment checks have
already been done by the submitting file system and don't need to be
repeated.

Signed-off-by: Christoph Hellwig <[email protected]>
Acked-by: Mike Snitzer <[email protected]>
---
drivers/md/dm-linear.c | 49 +++++++++++++-----------------------------
1 file changed, 15 insertions(+), 34 deletions(-)

diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 0a260c35aeeed..90de42f6743ac 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -163,63 +163,44 @@ static int linear_iterate_devices(struct dm_target *ti,
}

#if IS_ENABLED(CONFIG_FS_DAX)
+static struct dax_device *linear_dax_pgoff(struct dm_target *ti, pgoff_t *pgoff)
+{
+ struct linear_c *lc = ti->private;
+ sector_t sector = linear_map_sector(ti, *pgoff << PAGE_SECTORS_SHIFT);
+
+ *pgoff = (get_start_sect(lc->dev->bdev) + sector) >> PAGE_SECTORS_SHIFT;
+ return lc->dev->dax_dev;
+}
+
static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
long nr_pages, void **kaddr, pfn_t *pfn)
{
- long ret;
- struct linear_c *lc = ti->private;
- struct block_device *bdev = lc->dev->bdev;
- struct dax_device *dax_dev = lc->dev->dax_dev;
- sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
-
- dev_sector = linear_map_sector(ti, sector);
- ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages * PAGE_SIZE, &pgoff);
- if (ret)
- return ret;
+ struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff);
+
return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
}

static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
{
- struct linear_c *lc = ti->private;
- struct block_device *bdev = lc->dev->bdev;
- struct dax_device *dax_dev = lc->dev->dax_dev;
- sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
+ struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff);

- dev_sector = linear_map_sector(ti, sector);
- if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
- return 0;
return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
}

static size_t linear_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
{
- struct linear_c *lc = ti->private;
- struct block_device *bdev = lc->dev->bdev;
- struct dax_device *dax_dev = lc->dev->dax_dev;
- sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
+ struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff);

- dev_sector = linear_map_sector(ti, sector);
- if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
- return 0;
return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);
}

static int linear_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
size_t nr_pages)
{
- int ret;
- struct linear_c *lc = ti->private;
- struct block_device *bdev = lc->dev->bdev;
- struct dax_device *dax_dev = lc->dev->dax_dev;
- sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
-
- dev_sector = linear_map_sector(ti, sector);
- ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages << PAGE_SHIFT, &pgoff);
- if (ret)
- return ret;
+ struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff);
+
return dax_zero_page_range(dax_dev, pgoff, nr_pages);
}

--
2.30.2

2021-11-09 15:43:37

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 10/29] dm-log-writes: add a log_writes_dax_pgoff helper

Add a helper to perform the entire remapping for DAX accesses. This
helper open codes bdev_dax_pgoff given that the alignment checks have
already been done by the submitting file system and don't need to be
repeated.

Signed-off-by: Christoph Hellwig <[email protected]>
Acked-by: Mike Snitzer <[email protected]>
---
drivers/md/dm-log-writes.c | 42 +++++++++++++++-----------------------
1 file changed, 17 insertions(+), 25 deletions(-)

diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 524bc536922eb..df3cd78223fb2 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -949,17 +949,21 @@ static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes,
return 0;
}

+static struct dax_device *log_writes_dax_pgoff(struct dm_target *ti,
+ pgoff_t *pgoff)
+{
+ struct log_writes_c *lc = ti->private;
+
+ *pgoff += (get_start_sect(lc->dev->bdev) >> PAGE_SECTORS_SHIFT);
+ return lc->dev->dax_dev;
+}
+
static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
long nr_pages, void **kaddr, pfn_t *pfn)
{
- struct log_writes_c *lc = ti->private;
- sector_t sector = pgoff * PAGE_SECTORS;
- int ret;
+ struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff);

- ret = bdev_dax_pgoff(lc->dev->bdev, sector, nr_pages * PAGE_SIZE, &pgoff);
- if (ret)
- return ret;
- return dax_direct_access(lc->dev->dax_dev, pgoff, nr_pages, kaddr, pfn);
+ return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
}

static size_t log_writes_dax_copy_from_iter(struct dm_target *ti,
@@ -968,11 +972,9 @@ static size_t log_writes_dax_copy_from_iter(struct dm_target *ti,
{
struct log_writes_c *lc = ti->private;
sector_t sector = pgoff * PAGE_SECTORS;
+ struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff);
int err;

- if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
- return 0;
-
/* Don't bother doing anything if logging has been disabled */
if (!lc->logging_enabled)
goto dax_copy;
@@ -983,34 +985,24 @@ static size_t log_writes_dax_copy_from_iter(struct dm_target *ti,
return 0;
}
dax_copy:
- return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
+ return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
}

static size_t log_writes_dax_copy_to_iter(struct dm_target *ti,
pgoff_t pgoff, void *addr, size_t bytes,
struct iov_iter *i)
{
- struct log_writes_c *lc = ti->private;
- sector_t sector = pgoff * PAGE_SECTORS;
+ struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff);

- if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
- return 0;
- return dax_copy_to_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
+ return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);
}

static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
size_t nr_pages)
{
- int ret;
- struct log_writes_c *lc = ti->private;
- sector_t sector = pgoff * PAGE_SECTORS;
+ struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff);

- ret = bdev_dax_pgoff(lc->dev->bdev, sector, nr_pages << PAGE_SHIFT,
- &pgoff);
- if (ret)
- return ret;
- return dax_zero_page_range(lc->dev->dax_dev, pgoff,
- nr_pages << PAGE_SHIFT);
+ return dax_zero_page_range(dax_dev, pgoff, nr_pages << PAGE_SHIFT);
}

#else
--
2.30.2

2021-11-09 15:55:35

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 13/29] fsdax: use a saner calling convention for copy_cow_page_dax

Just pass the vm_fault and iomap_iter structures, and figure out the rest
locally. Note that this requires moving dax_iomap_sector up in the file.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/dax.c | 29 +++++++++++++----------------
1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 73bd1439d8089..e51b4129d1b65 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -709,26 +709,31 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
return __dax_invalidate_entry(mapping, index, false);
}

-static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_dev,
- sector_t sector, struct page *to, unsigned long vaddr)
+static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos)
{
+ return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
+}
+
+static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
+{
+ sector_t sector = dax_iomap_sector(&iter->iomap, iter->pos);
void *vto, *kaddr;
pgoff_t pgoff;
long rc;
int id;

- rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
+ rc = bdev_dax_pgoff(iter->iomap.bdev, sector, PAGE_SIZE, &pgoff);
if (rc)
return rc;

id = dax_read_lock();
- rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
+ rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, &kaddr, NULL);
if (rc < 0) {
dax_read_unlock(id);
return rc;
}
- vto = kmap_atomic(to);
- copy_user_page(vto, kaddr, vaddr, to);
+ vto = kmap_atomic(vmf->cow_page);
+ copy_user_page(vto, kaddr, vmf->address, vmf->cow_page);
kunmap_atomic(vto);
dax_read_unlock(id);
return 0;
@@ -1005,11 +1010,6 @@ int dax_writeback_mapping_range(struct address_space *mapping,
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

-static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos)
-{
- return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
-}
-
static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
pfn_t *pfnp)
{
@@ -1332,19 +1332,16 @@ static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
const struct iomap_iter *iter)
{
- sector_t sector = dax_iomap_sector(&iter->iomap, iter->pos);
- unsigned long vaddr = vmf->address;
vm_fault_t ret;
int error = 0;

switch (iter->iomap.type) {
case IOMAP_HOLE:
case IOMAP_UNWRITTEN:
- clear_user_highpage(vmf->cow_page, vaddr);
+ clear_user_highpage(vmf->cow_page, vmf->address);
break;
case IOMAP_MAPPED:
- error = copy_cow_page_dax(iter->iomap.bdev, iter->iomap.dax_dev,
- sector, vmf->cow_page, vaddr);
+ error = copy_cow_page_dax(vmf, iter);
break;
default:
WARN_ON_ONCE(1);
--
2.30.2

2021-11-09 15:55:35

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 11/29] dm-stripe: add a stripe_dax_pgoff helper

Add a helper to perform the entire remapping for DAX accesses. This
helper open codes bdev_dax_pgoff given that the alignment checks have
already been done by the submitting file system and don't need to be
repeated.

Signed-off-by: Christoph Hellwig <[email protected]>
Acked-by: Mike Snitzer <[email protected]>
---
drivers/md/dm-stripe.c | 63 ++++++++++--------------------------------
1 file changed, 15 insertions(+), 48 deletions(-)

diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index f084607220293..50dba3f39274c 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -301,83 +301,50 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
}

#if IS_ENABLED(CONFIG_FS_DAX)
-static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
- long nr_pages, void **kaddr, pfn_t *pfn)
+static struct dax_device *stripe_dax_pgoff(struct dm_target *ti, pgoff_t *pgoff)
{
- sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
struct stripe_c *sc = ti->private;
- struct dax_device *dax_dev;
struct block_device *bdev;
+ sector_t dev_sector;
uint32_t stripe;
- long ret;

- stripe_map_sector(sc, sector, &stripe, &dev_sector);
+ stripe_map_sector(sc, *pgoff * PAGE_SECTORS, &stripe, &dev_sector);
dev_sector += sc->stripe[stripe].physical_start;
- dax_dev = sc->stripe[stripe].dev->dax_dev;
bdev = sc->stripe[stripe].dev->bdev;

- ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages * PAGE_SIZE, &pgoff);
- if (ret)
- return ret;
+ *pgoff = (get_start_sect(bdev) + dev_sector) >> PAGE_SECTORS_SHIFT;
+ return sc->stripe[stripe].dev->dax_dev;
+}
+
+static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
+ long nr_pages, void **kaddr, pfn_t *pfn)
+{
+ struct dax_device *dax_dev = stripe_dax_pgoff(ti, &pgoff);
+
return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
}

static size_t stripe_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
{
- sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
- struct stripe_c *sc = ti->private;
- struct dax_device *dax_dev;
- struct block_device *bdev;
- uint32_t stripe;
-
- stripe_map_sector(sc, sector, &stripe, &dev_sector);
- dev_sector += sc->stripe[stripe].physical_start;
- dax_dev = sc->stripe[stripe].dev->dax_dev;
- bdev = sc->stripe[stripe].dev->bdev;
+ struct dax_device *dax_dev = stripe_dax_pgoff(ti, &pgoff);

- if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
- return 0;
return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
}

static size_t stripe_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
{
- sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
- struct stripe_c *sc = ti->private;
- struct dax_device *dax_dev;
- struct block_device *bdev;
- uint32_t stripe;
-
- stripe_map_sector(sc, sector, &stripe, &dev_sector);
- dev_sector += sc->stripe[stripe].physical_start;
- dax_dev = sc->stripe[stripe].dev->dax_dev;
- bdev = sc->stripe[stripe].dev->bdev;
+ struct dax_device *dax_dev = stripe_dax_pgoff(ti, &pgoff);

- if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
- return 0;
return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);
}

static int stripe_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
size_t nr_pages)
{
- int ret;
- sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
- struct stripe_c *sc = ti->private;
- struct dax_device *dax_dev;
- struct block_device *bdev;
- uint32_t stripe;
+ struct dax_device *dax_dev = stripe_dax_pgoff(ti, &pgoff);

- stripe_map_sector(sc, sector, &stripe, &dev_sector);
- dev_sector += sc->stripe[stripe].physical_start;
- dax_dev = sc->stripe[stripe].dev->dax_dev;
- bdev = sc->stripe[stripe].dev->bdev;
-
- ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages << PAGE_SHIFT, &pgoff);
- if (ret)
- return ret;
return dax_zero_page_range(dax_dev, pgoff, nr_pages);
}

--
2.30.2

2021-11-09 15:56:16

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 12/29] fsdax: remove a pointless __force cast in copy_cow_page_dax

Despite its name, copy_user_page expects kernel addresses, which is what
we already have.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/dax.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/dax.c b/fs/dax.c
index 4e3e5a283a916..73bd1439d8089 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -728,7 +728,7 @@ static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_d
return rc;
}
vto = kmap_atomic(to);
- copy_user_page(vto, (void __force *)kaddr, vaddr, to);
+ copy_user_page(vto, kaddr, vaddr, to);
kunmap_atomic(vto);
dax_read_unlock(id);
return 0;
--
2.30.2

2021-11-09 15:56:17

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 16/29] fsdax: simplify the offset check in dax_iomap_zero

The file-relative offset must have the same alignment as the storage
offset, so check the alignment of pos directly and get rid of the call
to iomap_sector.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/dax.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 5364549d67a48..d7a923d152240 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1123,7 +1123,6 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,

s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
{
- sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
long rc, id;
void *kaddr;
@@ -1131,8 +1130,7 @@ s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
unsigned offset = offset_in_page(pos);
unsigned size = min_t(u64, PAGE_SIZE - offset, length);

- if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) &&
- (size == PAGE_SIZE))
+ if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
page_aligned = true;

id = dax_read_lock();
--
2.30.2

2021-11-09 15:56:43

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 14/29] fsdax: simplify the pgoff calculation

Replace the two steps of dax_iomap_sector and bdev_dax_pgoff with a
single dax_iomap_pgoff helper that avoids lots of cumbersome sector
conversions.

Signed-off-by: Christoph Hellwig <[email protected]>
---
drivers/dax/super.c | 14 --------------
fs/dax.c | 35 ++++++++++-------------------------
include/linux/dax.h | 1 -
3 files changed, 10 insertions(+), 40 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 803942586d1b6..c0910687fbcb2 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -67,20 +67,6 @@ void dax_remove_host(struct gendisk *disk)
}
EXPORT_SYMBOL_GPL(dax_remove_host);

-int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
- pgoff_t *pgoff)
-{
- sector_t start_sect = bdev ? get_start_sect(bdev) : 0;
- phys_addr_t phys_off = (start_sect + sector) * 512;
-
- if (pgoff)
- *pgoff = PHYS_PFN(phys_off);
- if (phys_off % PAGE_SIZE || size % PAGE_SIZE)
- return -EINVAL;
- return 0;
-}
-EXPORT_SYMBOL(bdev_dax_pgoff);
-
/**
* dax_get_by_host() - temporary lookup mechanism for filesystem-dax
* @bdev: block device to find a dax_device for
diff --git a/fs/dax.c b/fs/dax.c
index e51b4129d1b65..5364549d67a48 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -709,23 +709,22 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
return __dax_invalidate_entry(mapping, index, false);
}

-static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos)
+static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
{
- return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
+ phys_addr_t paddr = iomap->addr + (pos & PAGE_MASK) - iomap->offset;
+
+ if (iomap->bdev)
+ paddr += (get_start_sect(iomap->bdev) << SECTOR_SHIFT);
+ return PHYS_PFN(paddr);
}

static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
{
- sector_t sector = dax_iomap_sector(&iter->iomap, iter->pos);
+ pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos);
void *vto, *kaddr;
- pgoff_t pgoff;
long rc;
int id;

- rc = bdev_dax_pgoff(iter->iomap.bdev, sector, PAGE_SIZE, &pgoff);
- if (rc)
- return rc;
-
id = dax_read_lock();
rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, &kaddr, NULL);
if (rc < 0) {
@@ -1013,14 +1012,10 @@ EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
pfn_t *pfnp)
{
- const sector_t sector = dax_iomap_sector(iomap, pos);
- pgoff_t pgoff;
+ pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
int id, rc;
long length;

- rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
- if (rc)
- return rc;
id = dax_read_lock();
length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
NULL, pfnp);
@@ -1129,7 +1124,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
{
sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
- pgoff_t pgoff;
+ pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
long rc, id;
void *kaddr;
bool page_aligned = false;
@@ -1140,10 +1135,6 @@ s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
(size == PAGE_SIZE))
page_aligned = true;

- rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff);
- if (rc)
- return rc;
-
id = dax_read_lock();

if (page_aligned)
@@ -1169,7 +1160,6 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
const struct iomap *iomap = &iomi->iomap;
loff_t length = iomap_length(iomi);
loff_t pos = iomi->pos;
- struct block_device *bdev = iomap->bdev;
struct dax_device *dax_dev = iomap->dax_dev;
loff_t end = pos + length, done = 0;
ssize_t ret = 0;
@@ -1203,9 +1193,8 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
while (pos < end) {
unsigned offset = pos & (PAGE_SIZE - 1);
const size_t size = ALIGN(length + offset, PAGE_SIZE);
- const sector_t sector = dax_iomap_sector(iomap, pos);
+ pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
ssize_t map_len;
- pgoff_t pgoff;
void *kaddr;

if (fatal_signal_pending(current)) {
@@ -1213,10 +1202,6 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
break;
}

- ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
- if (ret)
- break;
-
map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
&kaddr, NULL);
if (map_len < 0) {
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 439c3c70e347b..324363b798ecd 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -107,7 +107,6 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
#endif

struct writeback_control;
-int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff);
#if IS_ENABLED(CONFIG_FS_DAX)
int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
void dax_remove_host(struct gendisk *disk);
--
2.30.2

2021-11-09 15:58:00

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 15/29] xfs: add xfs_zero_range and xfs_truncate_page helpers

From: Shiyang Ruan <[email protected]>

Add helpers to prepare for using different DAX operations.

Signed-off-by: Shiyang Ruan <[email protected]>
[hch: split from a larger patch + slight cleanups]
Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/xfs/xfs_bmap_util.c | 7 +++----
fs/xfs/xfs_file.c | 3 +--
fs/xfs/xfs_iomap.c | 25 +++++++++++++++++++++++++
fs/xfs/xfs_iomap.h | 4 ++++
fs/xfs/xfs_iops.c | 7 +++----
fs/xfs/xfs_reflink.c | 3 +--
6 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 73a36b7be3bd1..797ea0c8b14e1 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1001,7 +1001,7 @@ xfs_free_file_space(

/*
* Now that we've unmap all full blocks we'll have to zero out any
- * partial block at the beginning and/or end. iomap_zero_range is smart
+ * partial block at the beginning and/or end. xfs_zero_range is smart
* enough to skip any holes, including those we just created, but we
* must take care not to zero beyond EOF and enlarge i_size.
*/
@@ -1009,15 +1009,14 @@ xfs_free_file_space(
return 0;
if (offset + len > XFS_ISIZE(ip))
len = XFS_ISIZE(ip) - offset;
- error = iomap_zero_range(VFS_I(ip), offset, len, NULL,
- &xfs_buffered_write_iomap_ops);
+ error = xfs_zero_range(ip, offset, len, NULL);
if (error)
return error;

/*
* If we zeroed right up to EOF and EOF straddles a page boundary we
* must make sure that the post-EOF area is also zeroed because the
- * page could be mmap'd and iomap_zero_range doesn't do that for us.
+ * page could be mmap'd and xfs_zero_range doesn't do that for us.
* Writeback of the eof page will do this, albeit clumsily.
*/
if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) {
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 27594738b0d18..8d4c5ca261bd7 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -437,8 +437,7 @@ xfs_file_write_checks(
}

trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
- error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
- NULL, &xfs_buffered_write_iomap_ops);
+ error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
if (error)
return error;
} else
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 093758440ad53..d6d71ae9f2ae4 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1311,3 +1311,28 @@ xfs_xattr_iomap_begin(
const struct iomap_ops xfs_xattr_iomap_ops = {
.iomap_begin = xfs_xattr_iomap_begin,
};
+
+int
+xfs_zero_range(
+ struct xfs_inode *ip,
+ loff_t pos,
+ loff_t len,
+ bool *did_zero)
+{
+ struct inode *inode = VFS_I(ip);
+
+ return iomap_zero_range(inode, pos, len, did_zero,
+ &xfs_buffered_write_iomap_ops);
+}
+
+int
+xfs_truncate_page(
+ struct xfs_inode *ip,
+ loff_t pos,
+ bool *did_zero)
+{
+ struct inode *inode = VFS_I(ip);
+
+ return iomap_truncate_page(inode, pos, did_zero,
+ &xfs_buffered_write_iomap_ops);
+}
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 7d3703556d0e0..f1a281ab9328c 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -20,6 +20,10 @@ xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
struct xfs_bmbt_irec *, u16);

+int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
+ bool *did_zero);
+int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero);
+
static inline xfs_filblks_t
xfs_aligned_fsb_count(
xfs_fileoff_t offset_fsb,
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index a607d6aca5c4d..ab5ef52b2a9ff 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -911,8 +911,8 @@ xfs_setattr_size(
*/
if (newsize > oldsize) {
trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
- error = iomap_zero_range(inode, oldsize, newsize - oldsize,
- &did_zeroing, &xfs_buffered_write_iomap_ops);
+ error = xfs_zero_range(ip, oldsize, newsize - oldsize,
+ &did_zeroing);
} else {
/*
* iomap won't detect a dirty page over an unwritten block (or a
@@ -924,8 +924,7 @@ xfs_setattr_size(
newsize);
if (error)
return error;
- error = iomap_truncate_page(inode, newsize, &did_zeroing,
- &xfs_buffered_write_iomap_ops);
+ error = xfs_truncate_page(ip, newsize, &did_zeroing);
}

if (error)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index cb0edb1d68ef1..facce5c076d83 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1269,8 +1269,7 @@ xfs_reflink_zero_posteof(
return 0;

trace_xfs_zero_eof(ip, isize, pos - isize);
- return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
- &xfs_buffered_write_iomap_ops);
+ return xfs_zero_range(ip, isize, pos - isize, NULL);
}

/*
--
2.30.2

2021-11-09 15:58:00

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 18/29] fsdax: decouple zeroing from the iomap buffered I/O code

Unshare the DAX and iomap buffered I/O page zeroing code. This code
previously did an IS_DAX check deep inside the iomap code, which in
fact was the only DAX check in the code. Instead move these checks
into the callers. Most callers already have DAX special casing anyway
and XFS will need it for reflink support as well.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/dax.c | 77 ++++++++++++++++++++++++++++++++++--------
fs/ext2/inode.c | 6 ++--
fs/ext4/inode.c | 4 +--
fs/iomap/buffered-io.c | 35 +++++++------------
fs/xfs/xfs_iomap.c | 6 ++++
include/linux/dax.h | 6 +++-
6 files changed, 91 insertions(+), 43 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index dc9ebeff850ab..5b52b878124ac 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1135,24 +1135,73 @@ static int dax_memzero(struct dax_device *dax_dev, pgoff_t pgoff,
return rc;
}

-s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
+static loff_t dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
- pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
- long rc, id;
- unsigned offset = offset_in_page(pos);
- unsigned size = min_t(u64, PAGE_SIZE - offset, length);
+ const struct iomap *iomap = &iter->iomap;
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ loff_t pos = iter->pos;
+ loff_t length = iomap_length(iter);
+ loff_t written = 0;
+
+ /* already zeroed? we're done. */
+ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
+ return length;
+
+ do {
+ unsigned offset = offset_in_page(pos);
+ unsigned size = min_t(u64, PAGE_SIZE - offset, length);
+ pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
+ long rc;
+ int id;

- id = dax_read_lock();
- if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
- rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
- else
- rc = dax_memzero(iomap->dax_dev, pgoff, offset, size);
- dax_read_unlock(id);
+ id = dax_read_lock();
+ if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
+ rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
+ else
+ rc = dax_memzero(iomap->dax_dev, pgoff, offset, size);
+ dax_read_unlock(id);

- if (rc < 0)
- return rc;
- return size;
+ if (rc < 0)
+ return rc;
+ pos += size;
+ length -= size;
+ written += size;
+ if (did_zero)
+ *did_zero = true;
+ } while (length > 0);
+
+ return written;
+}
+
+int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
+ const struct iomap_ops *ops)
+{
+ struct iomap_iter iter = {
+ .inode = inode,
+ .pos = pos,
+ .len = len,
+ .flags = IOMAP_ZERO,
+ };
+ int ret;
+
+ while ((ret = iomap_iter(&iter, ops)) > 0)
+ iter.processed = dax_zero_iter(&iter, did_zero);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dax_zero_range);
+
+int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+ const struct iomap_ops *ops)
+{
+ unsigned int blocksize = i_blocksize(inode);
+ unsigned int off = pos & (blocksize - 1);
+
+ /* Block boundary? Nothing to do */
+ if (!off)
+ return 0;
+ return dax_zero_range(inode, pos, blocksize - off, did_zero, ops);
}
+EXPORT_SYMBOL_GPL(dax_truncate_page);

static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
struct iov_iter *iter)
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 333fa62661d56..ae9993018a015 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1297,9 +1297,9 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
inode_dio_wait(inode);

if (IS_DAX(inode)) {
- error = iomap_zero_range(inode, newsize,
- PAGE_ALIGN(newsize) - newsize, NULL,
- &ext2_iomap_ops);
+ error = dax_zero_range(inode, newsize,
+ PAGE_ALIGN(newsize) - newsize, NULL,
+ &ext2_iomap_ops);
} else if (test_opt(inode->i_sb, NOBH))
error = nobh_truncate_page(inode->i_mapping,
newsize, ext2_get_block);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0f06305167d5a..8c443b753b815 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3783,8 +3783,8 @@ static int ext4_block_zero_page_range(handle_t *handle,
length = max;

if (IS_DAX(inode)) {
- return iomap_zero_range(inode, from, length, NULL,
- &ext4_iomap_ops);
+ return dax_zero_range(inode, from, length, NULL,
+ &ext4_iomap_ops);
}
return __ext4_block_zero_page_range(handle, mapping, from, length);
}
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 1753c26c8e76e..b1511255b4df8 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -870,26 +870,8 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
}
EXPORT_SYMBOL_GPL(iomap_file_unshare);

-static s64 __iomap_zero_iter(struct iomap_iter *iter, loff_t pos, u64 length)
-{
- struct page *page;
- int status;
- unsigned offset = offset_in_page(pos);
- unsigned bytes = min_t(u64, PAGE_SIZE - offset, length);
-
- status = iomap_write_begin(iter, pos, bytes, &page);
- if (status)
- return status;
-
- zero_user(page, offset, bytes);
- mark_page_accessed(page);
-
- return iomap_write_end(iter, pos, bytes, bytes, page);
-}
-
static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
- struct iomap *iomap = &iter->iomap;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
loff_t pos = iter->pos;
loff_t length = iomap_length(iter);
@@ -900,12 +882,19 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
return length;

do {
- s64 bytes;
+ unsigned offset = offset_in_page(pos);
+ size_t bytes = min_t(u64, PAGE_SIZE - offset, length);
+ struct page *page;
+ int status;

- if (IS_DAX(iter->inode))
- bytes = dax_iomap_zero(pos, length, iomap);
- else
- bytes = __iomap_zero_iter(iter, pos, length);
+ status = iomap_write_begin(iter, pos, bytes, &page);
+ if (status)
+ return status;
+
+ zero_user(page, offset, bytes);
+ mark_page_accessed(page);
+
+ bytes = iomap_write_end(iter, pos, bytes, bytes, page);
if (bytes < 0)
return bytes;

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d6d71ae9f2ae4..604000b6243ec 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1321,6 +1321,9 @@ xfs_zero_range(
{
struct inode *inode = VFS_I(ip);

+ if (IS_DAX(inode))
+ return dax_zero_range(inode, pos, len, did_zero,
+ &xfs_buffered_write_iomap_ops);
return iomap_zero_range(inode, pos, len, did_zero,
&xfs_buffered_write_iomap_ops);
}
@@ -1333,6 +1336,9 @@ xfs_truncate_page(
{
struct inode *inode = VFS_I(ip);

+ if (IS_DAX(inode))
+ return dax_truncate_page(inode, pos, did_zero,
+ &xfs_buffered_write_iomap_ops);
return iomap_truncate_page(inode, pos, did_zero,
&xfs_buffered_write_iomap_ops);
}
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 324363b798ecd..a5cc2f1aa840e 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -14,6 +14,7 @@ typedef unsigned long dax_entry_t;
struct dax_device;
struct gendisk;
struct iomap_ops;
+struct iomap_iter;
struct iomap;

struct dax_operations {
@@ -124,6 +125,10 @@ struct page *dax_layout_busy_page(struct address_space *mapping);
struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
dax_entry_t dax_lock_page(struct page *page);
void dax_unlock_page(struct page *page, dax_entry_t cookie);
+int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
+ const struct iomap_ops *ops);
+int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+ const struct iomap_ops *ops);
#else
static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
{
@@ -204,7 +209,6 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index);
-s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap);
static inline bool dax_mapping(struct address_space *mapping)
{
return mapping->host && IS_DAX(mapping->host);
--
2.30.2

2021-11-09 15:58:04

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 21/29] xfs: move dax device handling into xfs_{alloc,free}_buftarg

Hide the DAX device lookup from the xfs_super.c code.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/xfs/xfs_buf.c | 8 ++++----
fs/xfs/xfs_buf.h | 4 ++--
fs/xfs/xfs_super.c | 26 +++++---------------------
3 files changed, 11 insertions(+), 27 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 631c5a61d89b7..4d4553ffa7050 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1892,6 +1892,7 @@ xfs_free_buftarg(
list_lru_destroy(&btp->bt_lru);

blkdev_issue_flush(btp->bt_bdev);
+ fs_put_dax(btp->bt_daxdev);

kmem_free(btp);
}
@@ -1932,11 +1933,10 @@ xfs_setsize_buftarg_early(
return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
}

-xfs_buftarg_t *
+struct xfs_buftarg *
xfs_alloc_buftarg(
struct xfs_mount *mp,
- struct block_device *bdev,
- struct dax_device *dax_dev)
+ struct block_device *bdev)
{
xfs_buftarg_t *btp;

@@ -1945,7 +1945,7 @@ xfs_alloc_buftarg(
btp->bt_mount = mp;
btp->bt_dev = bdev->bd_dev;
btp->bt_bdev = bdev;
- btp->bt_daxdev = dax_dev;
+ btp->bt_daxdev = fs_dax_get_by_bdev(bdev);

/*
* Buffer IO error rate limiting. Limit it to no more than 10 messages
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 6b0200b8007d1..bd7f709f0d232 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -338,8 +338,8 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
/*
* Handling of buftargs.
*/
-extern struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *,
- struct block_device *, struct dax_device *);
+struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp,
+ struct block_device *bdev);
extern void xfs_free_buftarg(struct xfs_buftarg *);
extern void xfs_buftarg_wait(struct xfs_buftarg *);
extern void xfs_buftarg_drain(struct xfs_buftarg *);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 3a45d5caa28d5..7262716afb215 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -391,26 +391,19 @@ STATIC void
xfs_close_devices(
struct xfs_mount *mp)
{
- struct dax_device *dax_ddev = mp->m_ddev_targp->bt_daxdev;
-
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
- struct dax_device *dax_logdev = mp->m_logdev_targp->bt_daxdev;

xfs_free_buftarg(mp->m_logdev_targp);
xfs_blkdev_put(logdev);
- fs_put_dax(dax_logdev);
}
if (mp->m_rtdev_targp) {
struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
- struct dax_device *dax_rtdev = mp->m_rtdev_targp->bt_daxdev;

xfs_free_buftarg(mp->m_rtdev_targp);
xfs_blkdev_put(rtdev);
- fs_put_dax(dax_rtdev);
}
xfs_free_buftarg(mp->m_ddev_targp);
- fs_put_dax(dax_ddev);
}

/*
@@ -428,8 +421,6 @@ xfs_open_devices(
struct xfs_mount *mp)
{
struct block_device *ddev = mp->m_super->s_bdev;
- struct dax_device *dax_ddev = fs_dax_get_by_bdev(ddev);
- struct dax_device *dax_logdev = NULL, *dax_rtdev = NULL;
struct block_device *logdev = NULL, *rtdev = NULL;
int error;

@@ -439,8 +430,7 @@ xfs_open_devices(
if (mp->m_logname) {
error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
if (error)
- goto out;
- dax_logdev = fs_dax_get_by_bdev(logdev);
+ return error;
}

if (mp->m_rtname) {
@@ -454,25 +444,24 @@ xfs_open_devices(
error = -EINVAL;
goto out_close_rtdev;
}
- dax_rtdev = fs_dax_get_by_bdev(rtdev);
}

/*
* Setup xfs_mount buffer target pointers
*/
error = -ENOMEM;
- mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, dax_ddev);
+ mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
if (!mp->m_ddev_targp)
goto out_close_rtdev;

if (rtdev) {
- mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, dax_rtdev);
+ mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev);
if (!mp->m_rtdev_targp)
goto out_free_ddev_targ;
}

if (logdev && logdev != ddev) {
- mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, dax_logdev);
+ mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev);
if (!mp->m_logdev_targp)
goto out_free_rtdev_targ;
} else {
@@ -488,14 +477,9 @@ xfs_open_devices(
xfs_free_buftarg(mp->m_ddev_targp);
out_close_rtdev:
xfs_blkdev_put(rtdev);
- fs_put_dax(dax_rtdev);
out_close_logdev:
- if (logdev && logdev != ddev) {
+ if (logdev && logdev != ddev)
xfs_blkdev_put(logdev);
- fs_put_dax(dax_logdev);
- }
- out:
- fs_put_dax(dax_ddev);
return error;
}

--
2.30.2

2021-11-09 15:58:12

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 20/29] ext4: cleanup the dax handling in ext4_fill_super

Only call fs_dax_get_by_bdev once the sbi has been allocated and remove
the need for the dax_dev local variable.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/ext4/super.c | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index eb4df43abd76e..b60401bb1c310 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3879,7 +3879,6 @@ static void ext4_setup_csum_trigger(struct super_block *sb,

static int ext4_fill_super(struct super_block *sb, void *data, int silent)
{
- struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
char *orig_data = kstrdup(data, GFP_KERNEL);
struct buffer_head *bh, **group_desc;
struct ext4_super_block *es = NULL;
@@ -3910,12 +3909,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if ((data && !orig_data) || !sbi)
goto out_free_base;

- sbi->s_daxdev = dax_dev;
sbi->s_blockgroup_lock =
kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
if (!sbi->s_blockgroup_lock)
goto out_free_base;

+ sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev);
sb->s_fs_info = sbi;
sbi->s_sb = sb;
sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
@@ -4300,7 +4299,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}

- if (dax_dev) {
+ if (sbi->s_daxdev) {
if (blocksize == PAGE_SIZE)
set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
else
@@ -5096,10 +5095,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
out_fail:
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
+ fs_put_dax(sbi->s_daxdev );
out_free_base:
kfree(sbi);
kfree(orig_data);
- fs_put_dax(dax_dev);
return err ? err : ret;
}

--
2.30.2

2021-11-09 15:58:11

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 17/29] fsdax: factor out a dax_memzero helper

Factor out a helper for the "manual" zeroing of a DAX range to clean
up dax_iomap_zero a lot.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/dax.c | 36 +++++++++++++++++++-----------------
1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index d7a923d152240..dc9ebeff850ab 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1121,34 +1121,36 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
}
#endif /* CONFIG_FS_DAX_PMD */

+static int dax_memzero(struct dax_device *dax_dev, pgoff_t pgoff,
+ unsigned int offset, size_t size)
+{
+ void *kaddr;
+ long rc;
+
+ rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
+ if (rc >= 0) {
+ memset(kaddr + offset, 0, size);
+ dax_flush(dax_dev, kaddr + offset, size);
+ }
+ return rc;
+}
+
s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
{
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
long rc, id;
- void *kaddr;
- bool page_aligned = false;
unsigned offset = offset_in_page(pos);
unsigned size = min_t(u64, PAGE_SIZE - offset, length);

- if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
- page_aligned = true;
-
id = dax_read_lock();
-
- if (page_aligned)
+ if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
else
- rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
- if (rc < 0) {
- dax_read_unlock(id);
- return rc;
- }
-
- if (!page_aligned) {
- memset(kaddr + offset, 0, size);
- dax_flush(iomap->dax_dev, kaddr + offset, size);
- }
+ rc = dax_memzero(iomap->dax_dev, pgoff, offset, size);
dax_read_unlock(id);
+
+ if (rc < 0)
+ return rc;
return size;
}

--
2.30.2

2021-11-09 15:58:13

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 24/29] xfs: use xfs_direct_write_iomap_ops for DAX zeroing

While the buffered write iomap ops do work due to the fact that zeroing
never allocates blocks, the DAX zeroing should use the direct ops just
like actual DAX I/O.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/xfs/xfs_iomap.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 8cef3b68cba78..704292c6ce0c7 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1324,7 +1324,7 @@ xfs_zero_range(

if (IS_DAX(inode))
return dax_zero_range(inode, pos, len, did_zero,
- &xfs_buffered_write_iomap_ops);
+ &xfs_direct_write_iomap_ops);
return iomap_zero_range(inode, pos, len, did_zero,
&xfs_buffered_write_iomap_ops);
}
@@ -1339,7 +1339,7 @@ xfs_truncate_page(

if (IS_DAX(inode))
return dax_truncate_page(inode, pos, did_zero,
- &xfs_buffered_write_iomap_ops);
+ &xfs_direct_write_iomap_ops);
return iomap_truncate_page(inode, pos, did_zero,
&xfs_buffered_write_iomap_ops);
}
--
2.30.2

2021-11-09 15:58:20

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 19/29] ext2: cleanup the dax handling in ext2_fill_super

Only call fs_dax_get_by_bdev once the sbi has been allocated and remove
the need for the dax_dev local variable.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/ext2/super.c | 12 +++++-------
1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index a964066a80aa7..7e23482862e69 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -802,7 +802,6 @@ static unsigned long descriptor_loc(struct super_block *sb,

static int ext2_fill_super(struct super_block *sb, void *data, int silent)
{
- struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
struct buffer_head * bh;
struct ext2_sb_info * sbi;
struct ext2_super_block * es;
@@ -822,17 +821,17 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)

sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
- goto failed;
+ return -ENOMEM;

sbi->s_blockgroup_lock =
kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
if (!sbi->s_blockgroup_lock) {
kfree(sbi);
- goto failed;
+ return -ENOMEM;
}
sb->s_fs_info = sbi;
sbi->s_sb_block = sb_block;
- sbi->s_daxdev = dax_dev;
+ sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev);

spin_lock_init(&sbi->s_lock);
ret = -EINVAL;
@@ -946,7 +945,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);

if (test_opt(sb, DAX)) {
- if (!dax_dev) {
+ if (!sbi->s_daxdev) {
ext2_msg(sb, KERN_ERR,
"DAX unsupported by block device. Turning off DAX.");
clear_opt(sbi->s_mount_opt, DAX);
@@ -1201,11 +1200,10 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
failed_mount:
brelse(bh);
failed_sbi:
+ fs_put_dax(sbi->s_daxdev);
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
kfree(sbi);
-failed:
- fs_put_dax(dax_dev);
return ret;
}

--
2.30.2

2021-11-09 15:58:23

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 22/29] iomap: add an IOMAP_DAX flag

Add a flag so that the file system can easily detect DAX operations.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/dax.c | 7 ++++---
include/linux/iomap.h | 1 +
2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 5b52b878124ac..0bd6cdcbacfc4 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1180,7 +1180,7 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
.inode = inode,
.pos = pos,
.len = len,
- .flags = IOMAP_ZERO,
+ .flags = IOMAP_DAX | IOMAP_ZERO,
};
int ret;

@@ -1308,6 +1308,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
.inode = iocb->ki_filp->f_mapping->host,
.pos = iocb->ki_pos,
.len = iov_iter_count(iter),
+ .flags = IOMAP_DAX,
};
loff_t done = 0;
int ret;
@@ -1461,7 +1462,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
.inode = mapping->host,
.pos = (loff_t)vmf->pgoff << PAGE_SHIFT,
.len = PAGE_SIZE,
- .flags = IOMAP_FAULT,
+ .flags = IOMAP_DAX | IOMAP_FAULT,
};
vm_fault_t ret = 0;
void *entry;
@@ -1570,7 +1571,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
struct iomap_iter iter = {
.inode = mapping->host,
.len = PMD_SIZE,
- .flags = IOMAP_FAULT,
+ .flags = IOMAP_DAX | IOMAP_FAULT,
};
vm_fault_t ret = VM_FAULT_FALLBACK;
pgoff_t max_pgoff;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 6d1b08d0ae930..146a7e3e3ea11 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -141,6 +141,7 @@ struct iomap_page_ops {
#define IOMAP_NOWAIT (1 << 5) /* do not block */
#define IOMAP_OVERWRITE_ONLY (1 << 6) /* only pure overwrites allowed */
#define IOMAP_UNSHARE (1 << 7) /* unshare_file_range */
+#define IOMAP_DAX (1 << 8) /* DAX mapping */

struct iomap_ops {
/*
--
2.30.2

2021-11-09 15:58:29

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 25/29] dax: return the partition offset from fs_dax_get_by_bdev

Prepare for removing the block_device from the DAX I/O path by returning
the partition offset from fs_dax_get_by_bdev so that the file systems
have it at hand for use during I/O.

Signed-off-by: Christoph Hellwig <[email protected]>
---
drivers/dax/super.c | 9 ++++++---
drivers/md/dm.c | 4 ++--
fs/erofs/internal.h | 2 ++
fs/erofs/super.c | 4 ++--
fs/ext2/ext2.h | 1 +
fs/ext2/super.c | 2 +-
fs/ext4/ext4.h | 1 +
fs/ext4/super.c | 2 +-
fs/xfs/xfs_buf.c | 2 +-
fs/xfs/xfs_buf.h | 1 +
include/linux/dax.h | 6 ++++--
11 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index c0910687fbcb2..cc32dcf71c116 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -70,17 +70,20 @@ EXPORT_SYMBOL_GPL(dax_remove_host);
/**
* dax_get_by_host() - temporary lookup mechanism for filesystem-dax
* @bdev: block device to find a dax_device for
+ * @start_off: returns the byte offset into the dax_device that @bdev starts
*/
-struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
+struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off)
{
struct dax_device *dax_dev;
+ u64 part_size;
int id;

if (!blk_queue_dax(bdev->bd_disk->queue))
return NULL;

- if ((get_start_sect(bdev) * SECTOR_SIZE) % PAGE_SIZE ||
- (bdev_nr_sectors(bdev) * SECTOR_SIZE) % PAGE_SIZE) {
+ *start_off = get_start_sect(bdev) * SECTOR_SIZE;
+ part_size = bdev_nr_sectors(bdev) * SECTOR_SIZE;
+ if (*start_off % PAGE_SIZE || part_size % PAGE_SIZE) {
pr_info("%pg: error: unaligned partition for dax\n", bdev);
return NULL;
}
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 282008afc465f..5ea6115d19bdc 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -637,7 +637,7 @@ static int open_table_device(struct table_device *td, dev_t dev,
struct mapped_device *md)
{
struct block_device *bdev;
-
+ u64 part_off;
int r;

BUG_ON(td->dm_dev.bdev);
@@ -653,7 +653,7 @@ static int open_table_device(struct table_device *td, dev_t dev,
}

td->dm_dev.bdev = bdev;
- td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev);
+ td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off);
return 0;
}

diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 3265688af7f9f..c1e65346e9f15 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -51,6 +51,7 @@ struct erofs_device_info {
char *path;
struct block_device *bdev;
struct dax_device *dax_dev;
+ u64 dax_part_off;

u32 blocks;
u32 mapped_blkaddr;
@@ -109,6 +110,7 @@ struct erofs_sb_info {
#endif /* CONFIG_EROFS_FS_ZIP */
struct erofs_dev_context *devs;
struct dax_device *dax_dev;
+ u64 dax_part_off;
u64 total_blocks;
u32 primarydevice_blocks;

diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 0aed886473c8d..71efce16024d9 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -312,7 +312,7 @@ static int erofs_init_devices(struct super_block *sb,
goto err_out;
}
dif->bdev = bdev;
- dif->dax_dev = fs_dax_get_by_bdev(bdev);
+ dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off);
dif->blocks = le32_to_cpu(dis->blocks);
dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr);
sbi->total_blocks += dif->blocks;
@@ -644,7 +644,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)

sb->s_fs_info = sbi;
sbi->opt = ctx->opt;
- sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
+ sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->dax_part_off);
sbi->devs = ctx->devs;
ctx->devs = NULL;

diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 3be9dd6412b78..d4f306aa5aceb 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -118,6 +118,7 @@ struct ext2_sb_info {
spinlock_t s_lock;
struct mb_cache *s_ea_block_cache;
struct dax_device *s_daxdev;
+ u64 s_dax_part_off;
};

static inline spinlock_t *
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 7e23482862e69..94f1fbd7d3ac2 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -831,7 +831,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
}
sb->s_fs_info = sbi;
sbi->s_sb_block = sb_block;
- sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev);
+ sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off);

spin_lock_init(&sbi->s_lock);
ret = -EINVAL;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3825195539d74..6f01994a1d52f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1696,6 +1696,7 @@ struct ext4_sb_info {
*/
struct percpu_rw_semaphore s_writepages_rwsem;
struct dax_device *s_daxdev;
+ u64 s_dax_part_off;
#ifdef CONFIG_EXT4_DEBUG
unsigned long s_simulate_fail;
#endif
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b60401bb1c310..5a833847c5e65 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3914,7 +3914,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (!sbi->s_blockgroup_lock)
goto out_free_base;

- sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev);
+ sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off);
sb->s_fs_info = sbi;
sbi->s_sb = sb;
sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 4d4553ffa7050..bbb0fbd34e649 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1945,7 +1945,7 @@ xfs_alloc_buftarg(
btp->bt_mount = mp;
btp->bt_dev = bdev->bd_dev;
btp->bt_bdev = bdev;
- btp->bt_daxdev = fs_dax_get_by_bdev(bdev);
+ btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off);

/*
* Buffer IO error rate limiting. Limit it to no more than 10 messages
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index bd7f709f0d232..edcb6254fa6a8 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -89,6 +89,7 @@ typedef struct xfs_buftarg {
dev_t bt_dev;
struct block_device *bt_bdev;
struct dax_device *bt_daxdev;
+ u64 bt_dax_part_off;
struct xfs_mount *bt_mount;
unsigned int bt_meta_sectorsize;
size_t bt_meta_sectormask;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index a5cc2f1aa840e..90f95deff504d 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -117,7 +117,8 @@ static inline void fs_put_dax(struct dax_device *dax_dev)
put_dax(dax_dev);
}

-struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev);
+struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
+ u64 *start_off);
int dax_writeback_mapping_range(struct address_space *mapping,
struct dax_device *dax_dev, struct writeback_control *wbc);

@@ -142,7 +143,8 @@ static inline void fs_put_dax(struct dax_device *dax_dev)
{
}

-static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
+static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
+ u64 *start_off)
{
return NULL;
}
--
2.30.2

2021-11-09 15:59:15

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 23/29] xfs: use IOMAP_DAX to check for DAX mappings

Use the explicit DAX flag instead of checking the inode flag in the
iomap code.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/xfs/xfs_iomap.c | 7 ++++---
fs/xfs/xfs_iomap.h | 3 ++-
fs/xfs/xfs_pnfs.c | 2 +-
3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 604000b6243ec..8cef3b68cba78 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -188,6 +188,7 @@ xfs_iomap_write_direct(
struct xfs_inode *ip,
xfs_fileoff_t offset_fsb,
xfs_fileoff_t count_fsb,
+ unsigned int flags,
struct xfs_bmbt_irec *imap)
{
struct xfs_mount *mp = ip->i_mount;
@@ -229,7 +230,7 @@ xfs_iomap_write_direct(
* the reserve block pool for bmbt block allocation if there is no space
* left but we need to do unwritten extent conversion.
*/
- if (IS_DAX(VFS_I(ip))) {
+ if (flags & IOMAP_DAX) {
bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
if (imap->br_state == XFS_EXT_UNWRITTEN) {
force = true;
@@ -620,7 +621,7 @@ imap_needs_alloc(
imap->br_startblock == DELAYSTARTBLOCK)
return true;
/* we convert unwritten extents before copying the data for DAX */
- if (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN)
+ if ((flags & IOMAP_DAX) && imap->br_state == XFS_EXT_UNWRITTEN)
return true;
return false;
}
@@ -826,7 +827,7 @@ xfs_direct_write_iomap_begin(
xfs_iunlock(ip, lockmode);

error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
- &imap);
+ flags, &imap);
if (error)
return error;

diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index f1a281ab9328c..5648262a71736 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -12,7 +12,8 @@ struct xfs_inode;
struct xfs_bmbt_irec;

int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
- xfs_fileoff_t count_fsb, struct xfs_bmbt_irec *imap);
+ xfs_fileoff_t count_fsb, unsigned int flags,
+ struct xfs_bmbt_irec *imap);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
xfs_fileoff_t end_fsb);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 5e1d29d8b2e73..e188e1cf97cc5 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -155,7 +155,7 @@ xfs_fs_map_blocks(
xfs_iunlock(ip, lock_flags);

error = xfs_iomap_write_direct(ip, offset_fsb,
- end_fsb - offset_fsb, &imap);
+ end_fsb - offset_fsb, 0, &imap);
if (error)
goto out_unlock;

--
2.30.2

2021-11-09 15:59:17

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 26/29] fsdax: shift partition offset handling into the file systems

Remove the last user of ->bdev in dax.c by requiring the file system to
pass in an address that already includes the DAX offset. As part of that,
only set ->bdev or ->daxdev when actually required in the ->iomap_begin
methods.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/dax.c | 6 +-----
fs/erofs/data.c | 11 ++++++++--
fs/erofs/internal.h | 1 +
fs/ext2/inode.c | 8 +++++--
fs/ext4/inode.c | 16 +++++++++-----
fs/xfs/libxfs/xfs_bmap.c | 4 ++--
fs/xfs/xfs_aops.c | 2 +-
fs/xfs/xfs_iomap.c | 45 +++++++++++++++++++++++++---------------
fs/xfs/xfs_iomap.h | 5 +++--
fs/xfs/xfs_pnfs.c | 2 +-
10 files changed, 63 insertions(+), 37 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 0bd6cdcbacfc4..2c13c681edf09 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -711,11 +711,7 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,

static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
{
- phys_addr_t paddr = iomap->addr + (pos & PAGE_MASK) - iomap->offset;
-
- if (iomap->bdev)
- paddr += (get_start_sect(iomap->bdev) << SECTOR_SHIFT);
- return PHYS_PFN(paddr);
+ return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset);
}

static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 0e35ef3f9f3d7..9b1bb177ce303 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -159,6 +159,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
/* primary device by default */
map->m_bdev = sb->s_bdev;
map->m_daxdev = EROFS_SB(sb)->dax_dev;
+ map->m_dax_part_off = EROFS_SB(sb)->dax_part_off;

if (map->m_deviceid) {
down_read(&devs->rwsem);
@@ -169,6 +170,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
}
map->m_bdev = dif->bdev;
map->m_daxdev = dif->dax_dev;
+ map->m_dax_part_off = dif->dax_part_off;
up_read(&devs->rwsem);
} else if (devs->extra_devices) {
down_read(&devs->rwsem);
@@ -185,6 +187,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
map->m_pa -= startoff;
map->m_bdev = dif->bdev;
map->m_daxdev = dif->dax_dev;
+ map->m_dax_part_off = dif->dax_part_off;
break;
}
}
@@ -215,9 +218,13 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
if (ret)
return ret;

- iomap->bdev = mdev.m_bdev;
- iomap->dax_dev = mdev.m_daxdev;
iomap->offset = map.m_la;
+ if (flags & IOMAP_DAX) {
+ iomap->dax_dev = mdev.m_daxdev;
+ iomap->offset += mdev.m_dax_part_off;
+ } else {
+ iomap->bdev = mdev.m_bdev;
+ }
iomap->length = map.m_llen;
iomap->flags = 0;
iomap->private = NULL;
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index c1e65346e9f15..5c2a83876220c 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -438,6 +438,7 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode,
struct erofs_map_dev {
struct block_device *m_bdev;
struct dax_device *m_daxdev;
+ u64 m_dax_part_off;

erofs_off_t m_pa;
unsigned int m_deviceid;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index ae9993018a015..da4c301b43051 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -816,9 +816,11 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
return ret;

iomap->flags = 0;
- iomap->bdev = inode->i_sb->s_bdev;
iomap->offset = (u64)first_block << blkbits;
- iomap->dax_dev = sbi->s_daxdev;
+ if (flags & IOMAP_DAX)
+ iomap->dax_dev = sbi->s_daxdev;
+ else
+ iomap->bdev = inode->i_sb->s_bdev;

if (ret == 0) {
iomap->type = IOMAP_HOLE;
@@ -827,6 +829,8 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
} else {
iomap->type = IOMAP_MAPPED;
iomap->addr = (u64)bno << blkbits;
+ if (flags & IOMAP_DAX)
+ iomap->addr += sbi->s_dax_part_off;
iomap->length = (u64)ret << blkbits;
iomap->flags |= IOMAP_F_MERGED;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8c443b753b815..6cbecd7ff9383 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3274,7 +3274,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)

static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
struct ext4_map_blocks *map, loff_t offset,
- loff_t length)
+ loff_t length, unsigned int flags)
{
u8 blkbits = inode->i_blkbits;

@@ -3291,8 +3291,10 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
if (map->m_flags & EXT4_MAP_NEW)
iomap->flags |= IOMAP_F_NEW;

- iomap->bdev = inode->i_sb->s_bdev;
- iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
+ if (flags & IOMAP_DAX)
+ iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
+ else
+ iomap->bdev = inode->i_sb->s_bdev;
iomap->offset = (u64) map->m_lblk << blkbits;
iomap->length = (u64) map->m_len << blkbits;

@@ -3312,9 +3314,13 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
if (map->m_flags & EXT4_MAP_UNWRITTEN) {
iomap->type = IOMAP_UNWRITTEN;
iomap->addr = (u64) map->m_pblk << blkbits;
+ if (flags & IOMAP_DAX)
+ iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off;
} else if (map->m_flags & EXT4_MAP_MAPPED) {
iomap->type = IOMAP_MAPPED;
iomap->addr = (u64) map->m_pblk << blkbits;
+ if (flags & IOMAP_DAX)
+ iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off;
} else {
iomap->type = IOMAP_HOLE;
iomap->addr = IOMAP_NULL_ADDR;
@@ -3423,7 +3429,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
if (ret < 0)
return ret;
out:
- ext4_set_iomap(inode, iomap, &map, offset, length);
+ ext4_set_iomap(inode, iomap, &map, offset, length, flags);

return 0;
}
@@ -3543,7 +3549,7 @@ static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
delalloc = ext4_iomap_is_delalloc(inode, &map);

set_iomap:
- ext4_set_iomap(inode, iomap, &map, offset, length);
+ ext4_set_iomap(inode, iomap, &map, offset, length, flags);
if (delalloc && iomap->type == IOMAP_HOLE)
iomap->type = IOMAP_DELALLOC;

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 4dccd4d90622d..74198dd82b035 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4551,7 +4551,7 @@ xfs_bmapi_convert_delalloc(
* the extent. Just return the real extent at this offset.
*/
if (!isnullstartblock(bma.got.br_startblock)) {
- xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
+ xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
*seq = READ_ONCE(ifp->if_seq);
goto out_trans_cancel;
}
@@ -4598,7 +4598,7 @@ xfs_bmapi_convert_delalloc(
XFS_STATS_INC(mp, xs_xstrat_quick);

ASSERT(!isnullstartblock(bma.got.br_startblock));
- xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
+ xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
*seq = READ_ONCE(ifp->if_seq);

if (whichfork == XFS_COW_FORK)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index c8c15c3c31471..6ac3449a68ba0 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -359,7 +359,7 @@ xfs_map_blocks(
isnullstartblock(imap.br_startblock))
goto allocate_blocks;

- xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0);
+ xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0);
trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
return 0;
allocate_blocks:
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 704292c6ce0c7..74dbf1fd99d39 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -54,7 +54,8 @@ xfs_bmbt_to_iomap(
struct xfs_inode *ip,
struct iomap *iomap,
struct xfs_bmbt_irec *imap,
- u16 flags)
+ unsigned int flags,
+ u16 iomap_flags)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_buftarg *target = xfs_inode_buftarg(ip);
@@ -71,16 +72,22 @@ xfs_bmbt_to_iomap(
iomap->type = IOMAP_DELALLOC;
} else {
iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock));
+ if (flags & IOMAP_DAX)
+ iomap->addr += target->bt_dax_part_off;
+
if (imap->br_state == XFS_EXT_UNWRITTEN)
iomap->type = IOMAP_UNWRITTEN;
else
iomap->type = IOMAP_MAPPED;
+
}
iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
- iomap->bdev = target->bt_bdev;
- iomap->dax_dev = target->bt_daxdev;
- iomap->flags = flags;
+ if (flags & IOMAP_DAX)
+ iomap->dax_dev = target->bt_daxdev;
+ else
+ iomap->bdev = target->bt_bdev;
+ iomap->flags = iomap_flags;

if (xfs_ipincount(ip) &&
(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
@@ -801,7 +808,7 @@ xfs_direct_write_iomap_begin(

xfs_iunlock(ip, lockmode);
trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags);

allocate_blocks:
error = -EAGAIN;
@@ -832,18 +839,19 @@ xfs_direct_write_iomap_begin(
return error;

trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags | IOMAP_F_NEW);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
+ iomap_flags | IOMAP_F_NEW);

out_found_cow:
xfs_iunlock(ip, lockmode);
length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
if (imap.br_startblock != HOLESTARTBLOCK) {
- error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0);
+ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
if (error)
return error;
}
- return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED);

out_unlock:
if (lockmode)
@@ -1053,23 +1061,24 @@ xfs_buffered_write_iomap_begin(
*/
xfs_iunlock(ip, XFS_ILOCK_EXCL);
trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, IOMAP_F_NEW);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW);

found_imap:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);

found_cow:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (imap.br_startoff <= offset_fsb) {
- error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0);
+ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
if (error)
return error;
- return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
+ IOMAP_F_SHARED);
}

xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
- return xfs_bmbt_to_iomap(ip, iomap, &cmap, 0);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0);

out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -1178,7 +1187,8 @@ xfs_read_iomap_begin(
if (error)
return error;
trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, shared ? IOMAP_F_SHARED : 0);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
+ shared ? IOMAP_F_SHARED : 0);
}

const struct iomap_ops xfs_read_iomap_ops = {
@@ -1237,7 +1247,8 @@ xfs_seek_iomap_begin(
if (data_fsb < cow_fsb + cmap.br_blockcount)
end_fsb = min(end_fsb, data_fsb);
xfs_trim_extent(&cmap, offset_fsb, end_fsb);
- error = xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
+ error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
+ IOMAP_F_SHARED);
/*
* This is a COW extent, so we must probe the page cache
* because there could be dirty page cache being backed
@@ -1259,7 +1270,7 @@ xfs_seek_iomap_begin(
imap.br_state = XFS_EXT_NORM;
done:
xfs_trim_extent(&imap, offset_fsb, end_fsb);
- error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
out_unlock:
xfs_iunlock(ip, lockmode);
return error;
@@ -1306,7 +1317,7 @@ xfs_xattr_iomap_begin(
if (error)
return error;
ASSERT(nimaps);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
}

const struct iomap_ops xfs_xattr_iomap_ops = {
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 5648262a71736..fe7a625361d95 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,8 +18,9 @@ int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
xfs_fileoff_t end_fsb);

-int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
- struct xfs_bmbt_irec *, u16);
+int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
+ struct xfs_bmbt_irec *imap, unsigned int flags,
+ u16 iomap_flags);

int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
bool *did_zero);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index e188e1cf97cc5..d6334abbc0b3e 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -173,7 +173,7 @@ xfs_fs_map_blocks(
}
xfs_iunlock(ip, XFS_IOLOCK_EXCL);

- error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0);
*device_generation = mp->m_generation;
return error;
out_unlock:
--
2.30.2

2021-11-09 15:59:31

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 28/29] iomap: build the block based code conditionally

Only build the block based iomap code if CONFIG_BLOCK is set. Currently
that is always the case, but it will change soon.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/Kconfig | 4 ++--
fs/iomap/Makefile | 4 ++--
2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/Kconfig b/fs/Kconfig
index a6313a969bc5f..6d608330a096e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -15,11 +15,11 @@ config VALIDATE_FS_PARSER
Enable this to perform validation of the parameter description for a
filesystem when it is registered.

-if BLOCK
-
config FS_IOMAP
bool

+if BLOCK
+
source "fs/ext2/Kconfig"
source "fs/ext4/Kconfig"
source "fs/jbd2/Kconfig"
diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
index 4143a3ff89dbc..fc070184b7faa 100644
--- a/fs/iomap/Makefile
+++ b/fs/iomap/Makefile
@@ -9,9 +9,9 @@ ccflags-y += -I $(srctree)/$(src) # needed for trace events
obj-$(CONFIG_FS_IOMAP) += iomap.o

iomap-y += trace.o \
- buffered-io.o \
+ iter.o
+iomap-$(CONFIG_BLOCK) += buffered-io.o \
direct-io.o \
fiemap.o \
- iter.o \
seek.o
iomap-$(CONFIG_SWAP) += swapfile.o
--
2.30.2

2021-11-09 15:59:48

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 29/29] fsdax: don't require CONFIG_BLOCK

The file system DAX code no longer requires the block layer. So allow
building a kernel with fuse DAX but without the block layer.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/Kconfig | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/Kconfig b/fs/Kconfig
index 6d608330a096e..7a2b11c0b8036 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -42,6 +42,8 @@ source "fs/nilfs2/Kconfig"
source "fs/f2fs/Kconfig"
source "fs/zonefs/Kconfig"

+endif # BLOCK
+
config FS_DAX
bool "File system based Direct Access (DAX) support"
depends on MMU
@@ -89,8 +91,6 @@ config FS_DAX_PMD
config FS_DAX_LIMITED
bool

-endif # BLOCK
-
# Posix ACL utility routines
#
# Note: Posix ACLs can be implemented without these helpers. Never use
--
2.30.2

2021-11-09 16:00:09

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 27/29] dax: fix up some of the block device related ifdefs

The DAX device <-> block device association is only enabled if
CONFIG_BLOCK is enabled. Update dax.h to account for that and use
the right conditions for the fs_put_dax stub as well.

Signed-off-by: Christoph Hellwig <[email protected]>
---
include/linux/dax.h | 41 ++++++++++++++++++++---------------------
1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/include/linux/dax.h b/include/linux/dax.h
index 90f95deff504d..5568d3dca941b 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -108,28 +108,15 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
#endif

struct writeback_control;
-#if IS_ENABLED(CONFIG_FS_DAX)
+#if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
void dax_remove_host(struct gendisk *disk);
-
+struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
+ u64 *start_off);
static inline void fs_put_dax(struct dax_device *dax_dev)
{
put_dax(dax_dev);
}
-
-struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
- u64 *start_off);
-int dax_writeback_mapping_range(struct address_space *mapping,
- struct dax_device *dax_dev, struct writeback_control *wbc);
-
-struct page *dax_layout_busy_page(struct address_space *mapping);
-struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
-dax_entry_t dax_lock_page(struct page *page);
-void dax_unlock_page(struct page *page, dax_entry_t cookie);
-int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
- const struct iomap_ops *ops);
-int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
- const struct iomap_ops *ops);
#else
static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
{
@@ -138,17 +125,29 @@ static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
static inline void dax_remove_host(struct gendisk *disk)
{
}
-
-static inline void fs_put_dax(struct dax_device *dax_dev)
-{
-}
-
static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
u64 *start_off)
{
return NULL;
}
+static inline void fs_put_dax(struct dax_device *dax_dev)
+{
+}
+#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */
+
+#if IS_ENABLED(CONFIG_FS_DAX)
+int dax_writeback_mapping_range(struct address_space *mapping,
+ struct dax_device *dax_dev, struct writeback_control *wbc);

+struct page *dax_layout_busy_page(struct address_space *mapping);
+struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
+dax_entry_t dax_lock_page(struct page *page);
+void dax_unlock_page(struct page *page, dax_entry_t cookie);
+int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
+ const struct iomap_ops *ops);
+int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+ const struct iomap_ops *ops);
+#else
static inline struct page *dax_layout_busy_page(struct address_space *mapping)
{
return NULL;
--
2.30.2

2021-11-17 17:23:59

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 02/29] dm: make the DAX support dependend on CONFIG_FS_DAX

On Tue, Nov 9, 2021 at 12:33 AM Christoph Hellwig <[email protected]> wrote:
>
> The device mapper DAX support is all hanging off a block device and thus
> can't be used with device dax. Make it depend on CONFIG_FS_DAX instead
> of CONFIG_DAX_DRIVER. This also means that bdev_dax_pgoff only needs to
> be built under CONFIG_FS_DAX now.
>

Applied, fixed the spelling of 'dependent' in the subject and picked
up Mike's Ack from the previous send:

https://lore.kernel.org/r/[email protected]

Christoph, any particular reason you did not pick up the tags from the
last posting?

2021-11-17 17:31:13

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 07/29] xfs: factor out a xfs_setup_dax_always helper

On Tue, Nov 09, 2021 at 09:32:47AM +0100, Christoph Hellwig wrote:
> Factor out another DAX setup helper to simplify future changes. Also
> move the experimental warning after the checks to not clutter the log
> too much if the setup failed.
>
> Signed-off-by: Christoph Hellwig <[email protected]>

Reviewed-by: Darrick J. Wong <[email protected]>

--D

> ---
> fs/xfs/xfs_super.c | 47 +++++++++++++++++++++++++++-------------------
> 1 file changed, 28 insertions(+), 19 deletions(-)
>
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index e21459f9923a8..875fd3151d6c9 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -340,6 +340,32 @@ xfs_buftarg_is_dax(
> bdev_nr_sectors(bt->bt_bdev));
> }
>
> +static int
> +xfs_setup_dax_always(
> + struct xfs_mount *mp)
> +{
> + struct super_block *sb = mp->m_super;
> +
> + if (!xfs_buftarg_is_dax(sb, mp->m_ddev_targp) &&
> + (!mp->m_rtdev_targp || !xfs_buftarg_is_dax(sb, mp->m_rtdev_targp))) {
> + xfs_alert(mp,
> + "DAX unsupported by block device. Turning off DAX.");
> + goto disable_dax;
> + }
> +
> + if (xfs_has_reflink(mp)) {
> + xfs_alert(mp, "DAX and reflink cannot be used together!");
> + return -EINVAL;
> + }
> +
> + xfs_warn(mp, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
> + return 0;
> +
> +disable_dax:
> + xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER);
> + return 0;
> +}
> +
> STATIC int
> xfs_blkdev_get(
> xfs_mount_t *mp,
> @@ -1593,26 +1619,9 @@ xfs_fs_fill_super(
> sb->s_flags |= SB_I_VERSION;
>
> if (xfs_has_dax_always(mp)) {
> - bool rtdev_is_dax = false, datadev_is_dax;
> -
> - xfs_warn(mp,
> - "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
> -
> - datadev_is_dax = xfs_buftarg_is_dax(sb, mp->m_ddev_targp);
> - if (mp->m_rtdev_targp)
> - rtdev_is_dax = xfs_buftarg_is_dax(sb,
> - mp->m_rtdev_targp);
> - if (!rtdev_is_dax && !datadev_is_dax) {
> - xfs_alert(mp,
> - "DAX unsupported by block device. Turning off DAX.");
> - xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER);
> - }
> - if (xfs_has_reflink(mp)) {
> - xfs_alert(mp,
> - "DAX and reflink cannot be used together!");
> - error = -EINVAL;
> + error = xfs_setup_dax_always(mp);
> + if (error)
> goto out_filestream_unmount;
> - }
> }
>
> if (xfs_has_discard(mp)) {
> --
> 2.30.2
>

2021-11-17 17:44:02

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 03/29] dax: remove CONFIG_DAX_DRIVER

On Tue, Nov 9, 2021 at 12:33 AM Christoph Hellwig <[email protected]> wrote:
>
> CONFIG_DAX_DRIVER only selects CONFIG_DAX now, so remove it.

Applied.

2021-11-17 17:44:40

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 01/29] nvdimm/pmem: move dax_attribute_group from dax to pmem

On Tue, Nov 9, 2021 at 12:33 AM Christoph Hellwig <[email protected]> wrote:
>
> dax_attribute_group is only used by the pmem driver, and can avoid the
> completely pointless lookup by the disk name if moved there. This
> leaves just a single caller of dax_get_by_host, so move dax_get_by_host
> into the same ifdef block as that caller.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> Reviewed-by: Dan Williams <[email protected]>
> Link: https://lore.kernel.org/r/[email protected]
> Signed-off-by: Dan Williams <[email protected]>

This one already made v5.16-rc1.

2021-11-19 06:55:03

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 02/29] dm: make the DAX support dependend on CONFIG_FS_DAX

On Wed, Nov 17, 2021 at 09:23:44AM -0800, Dan Williams wrote:
> Applied, fixed the spelling of 'dependent' in the subject and picked
> up Mike's Ack from the previous send:
>
> https://lore.kernel.org/r/[email protected]
>
> Christoph, any particular reason you did not pick up the tags from the
> last posting?

I thought I did, but apparently I've missed some.

2021-11-19 06:56:50

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 01/29] nvdimm/pmem: move dax_attribute_group from dax to pmem

On Wed, Nov 17, 2021 at 09:44:25AM -0800, Dan Williams wrote:
> On Tue, Nov 9, 2021 at 12:33 AM Christoph Hellwig <[email protected]> wrote:
> >
> > dax_attribute_group is only used by the pmem driver, and can avoid the
> > completely pointless lookup by the disk name if moved there. This
> > leaves just a single caller of dax_get_by_host, so move dax_get_by_host
> > into the same ifdef block as that caller.
> >
> > Signed-off-by: Christoph Hellwig <[email protected]>
> > Reviewed-by: Dan Williams <[email protected]>
> > Link: https://lore.kernel.org/r/[email protected]
> > Signed-off-by: Dan Williams <[email protected]>
>
> This one already made v5.16-rc1.

Yes, but 5.16-rc1 did not exist yet when I posted the series.

Note that the series also has a conflict against 5.16-rc1 in pmem.c,
and buildbot pointed out the file systems need explicit dax.h
includes in a few files for some configurations.

The current branch is here, I just did not bother to repost without
any comments:

http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/dax-block-cleanup

no functional changes.

2021-11-19 17:21:21

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 01/29] nvdimm/pmem: move dax_attribute_group from dax to pmem

On Thu, Nov 18, 2021 at 10:56 PM Christoph Hellwig <[email protected]> wrote:
>
> On Wed, Nov 17, 2021 at 09:44:25AM -0800, Dan Williams wrote:
> > On Tue, Nov 9, 2021 at 12:33 AM Christoph Hellwig <[email protected]> wrote:
> > >
> > > dax_attribute_group is only used by the pmem driver, and can avoid the
> > > completely pointless lookup by the disk name if moved there. This
> > > leaves just a single caller of dax_get_by_host, so move dax_get_by_host
> > > into the same ifdef block as that caller.
> > >
> > > Signed-off-by: Christoph Hellwig <[email protected]>
> > > Reviewed-by: Dan Williams <[email protected]>
> > > Link: https://lore.kernel.org/r/[email protected]
> > > Signed-off-by: Dan Williams <[email protected]>
> >
> > This one already made v5.16-rc1.
>
> Yes, but 5.16-rc1 did not exist yet when I posted the series.
>
> Note that the series also has a conflict against 5.16-rc1 in pmem.c,
> and buildbot pointed out the file systems need explicit dax.h
> includes in a few files for some configurations.
>
> The current branch is here, I just did not bother to repost without
> any comments:
>
> http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/dax-block-cleanup
>
> no functional changes.

Do you just want to send me a pull request after you add all the acks?

2021-11-23 02:54:23

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 02/29] dm: make the DAX support dependend on CONFIG_FS_DAX

On Thu, Nov 18, 2021 at 10:55 PM Christoph Hellwig <[email protected]> wrote:
>
> On Wed, Nov 17, 2021 at 09:23:44AM -0800, Dan Williams wrote:
> > Applied, fixed the spelling of 'dependent' in the subject and picked
> > up Mike's Ack from the previous send:
> >
> > https://lore.kernel.org/r/[email protected]
> >
> > Christoph, any particular reason you did not pick up the tags from the
> > last posting?
>
> I thought I did, but apparently I've missed some.

I'll reply with the ones I see missing that need carrying over and add
my own reviewed-by then you can send me a pull request when ready,
deal?

2021-11-23 02:54:56

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 03/29] dax: remove CONFIG_DAX_DRIVER

On Wed, Nov 17, 2021 at 9:43 AM Dan Williams <[email protected]> wrote:
>
> On Tue, Nov 9, 2021 at 12:33 AM Christoph Hellwig <[email protected]> wrote:
> >
> > CONFIG_DAX_DRIVER only selects CONFIG_DAX now, so remove it.
>
> Applied.

Unapplied,

Reviewed-by: Dan Williams <[email protected]>

2021-11-23 03:33:20

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 04/29] dax: simplify the dax_device <-> gendisk association

On Tue, Nov 9, 2021 at 12:33 AM Christoph Hellwig <[email protected]> wrote:
>
> Replace the dax_host_hash with an xarray indexed by the pointer value
> of the gendisk, and require explicit calls from the block drivers that
> want to associate their gendisk with a dax_device.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> Acked-by: Mike Snitzer <[email protected]>
> ---
> drivers/dax/bus.c | 6 +-
> drivers/dax/super.c | 106 +++++++++--------------------------
> drivers/md/dm.c | 6 +-
> drivers/nvdimm/pmem.c | 8 ++-
> drivers/s390/block/dcssblk.c | 11 +++-
> fs/fuse/virtio_fs.c | 2 +-
> include/linux/dax.h | 19 +++++--
> 7 files changed, 62 insertions(+), 96 deletions(-)
>
> diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
> index 6cc4da4c713d9..bd7af2f7c5b0a 100644
> --- a/drivers/dax/bus.c
> +++ b/drivers/dax/bus.c
> @@ -1323,10 +1323,10 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data)
> }
>
> /*
> - * No 'host' or dax_operations since there is no access to this
> - * device outside of mmap of the resulting character device.
> + * No dax_operations since there is no access to this device outside of
> + * mmap of the resulting character device.
> */
> - dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC);
> + dax_dev = alloc_dax(dev_dax, NULL, DAXDEV_F_SYNC);
> if (IS_ERR(dax_dev)) {
> rc = PTR_ERR(dax_dev);
> goto err_alloc_dax;
> diff --git a/drivers/dax/super.c b/drivers/dax/super.c
> index e20d0cef10a18..9383c11b21853 100644
> --- a/drivers/dax/super.c
> +++ b/drivers/dax/super.c
> @@ -7,10 +7,8 @@
> #include <linux/mount.h>
> #include <linux/pseudo_fs.h>
> #include <linux/magic.h>
> -#include <linux/genhd.h>
> #include <linux/pfn_t.h>
> #include <linux/cdev.h>
> -#include <linux/hash.h>
> #include <linux/slab.h>
> #include <linux/uio.h>
> #include <linux/dax.h>
> @@ -26,10 +24,8 @@
> * @flags: state and boolean properties
> */
> struct dax_device {
> - struct hlist_node list;
> struct inode inode;
> struct cdev cdev;
> - const char *host;
> void *private;
> unsigned long flags;
> const struct dax_operations *ops;
> @@ -42,10 +38,6 @@ static DEFINE_IDA(dax_minor_ida);
> static struct kmem_cache *dax_cache __read_mostly;
> static struct super_block *dax_superblock __read_mostly;
>
> -#define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head))
> -static struct hlist_head dax_host_list[DAX_HASH_SIZE];
> -static DEFINE_SPINLOCK(dax_host_lock);
> -
> int dax_read_lock(void)
> {
> return srcu_read_lock(&dax_srcu);
> @@ -58,13 +50,22 @@ void dax_read_unlock(int id)
> }
> EXPORT_SYMBOL_GPL(dax_read_unlock);
>
> -static int dax_host_hash(const char *host)
> +#if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
> +#include <linux/blkdev.h>
> +
> +static DEFINE_XARRAY(dax_hosts);
> +
> +int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
> {
> - return hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE;
> + return xa_insert(&dax_hosts, (unsigned long)disk, dax_dev, GFP_KERNEL);
> }
> +EXPORT_SYMBOL_GPL(dax_add_host);

Is it time to add a "DAX" symbol namespace?

>
> -#if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
> -#include <linux/blkdev.h>
> +void dax_remove_host(struct gendisk *disk)
> +{
> + xa_erase(&dax_hosts, (unsigned long)disk);
> +}
> +EXPORT_SYMBOL_GPL(dax_remove_host);
>
> int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
> pgoff_t *pgoff)
> @@ -82,40 +83,23 @@ EXPORT_SYMBOL(bdev_dax_pgoff);
>
> /**
> * dax_get_by_host() - temporary lookup mechanism for filesystem-dax
> - * @host: alternate name for the device registered by a dax driver
> + * @bdev: block device to find a dax_device for
> */
> -static struct dax_device *dax_get_by_host(const char *host)
> +struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
> {
> - struct dax_device *dax_dev, *found = NULL;
> - int hash, id;
> + struct dax_device *dax_dev;
> + int id;
>
> - if (!host)
> + if (!blk_queue_dax(bdev->bd_disk->queue))
> return NULL;
>
> - hash = dax_host_hash(host);
> -
> id = dax_read_lock();
> - spin_lock(&dax_host_lock);
> - hlist_for_each_entry(dax_dev, &dax_host_list[hash], list) {
> - if (!dax_alive(dax_dev)
> - || strcmp(host, dax_dev->host) != 0)
> - continue;
> -
> - if (igrab(&dax_dev->inode))
> - found = dax_dev;
> - break;
> - }
> - spin_unlock(&dax_host_lock);
> + dax_dev = xa_load(&dax_hosts, (unsigned long)bdev->bd_disk);
> + if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode))
> + dax_dev = NULL;
> dax_read_unlock(id);
>
> - return found;
> -}
> -
> -struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
> -{
> - if (!blk_queue_dax(bdev->bd_disk->queue))
> - return NULL;
> - return dax_get_by_host(bdev->bd_disk->disk_name);
> + return dax_dev;
> }
> EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
>
> @@ -361,12 +345,7 @@ void kill_dax(struct dax_device *dax_dev)
> return;
>
> clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
> -
> synchronize_srcu(&dax_srcu);
> -
> - spin_lock(&dax_host_lock);
> - hlist_del_init(&dax_dev->list);
> - spin_unlock(&dax_host_lock);
> }
> EXPORT_SYMBOL_GPL(kill_dax);
>
> @@ -398,8 +377,6 @@ static struct dax_device *to_dax_dev(struct inode *inode)
> static void dax_free_inode(struct inode *inode)
> {
> struct dax_device *dax_dev = to_dax_dev(inode);
> - kfree(dax_dev->host);
> - dax_dev->host = NULL;
> if (inode->i_rdev)
> ida_simple_remove(&dax_minor_ida, iminor(inode));
> kmem_cache_free(dax_cache, dax_dev);
> @@ -474,54 +451,25 @@ static struct dax_device *dax_dev_get(dev_t devt)
> return dax_dev;
> }
>
> -static void dax_add_host(struct dax_device *dax_dev, const char *host)
> -{
> - int hash;
> -
> - /*
> - * Unconditionally init dax_dev since it's coming from a
> - * non-zeroed slab cache
> - */
> - INIT_HLIST_NODE(&dax_dev->list);
> - dax_dev->host = host;
> - if (!host)
> - return;
> -
> - hash = dax_host_hash(host);
> - spin_lock(&dax_host_lock);
> - hlist_add_head(&dax_dev->list, &dax_host_list[hash]);
> - spin_unlock(&dax_host_lock);
> -}
> -
> -struct dax_device *alloc_dax(void *private, const char *__host,
> - const struct dax_operations *ops, unsigned long flags)
> +struct dax_device *alloc_dax(void *private, const struct dax_operations *ops,
> + unsigned long flags)
> {
> struct dax_device *dax_dev;
> - const char *host;
> dev_t devt;
> int minor;
>
> - if (ops && !ops->zero_page_range) {
> - pr_debug("%s: error: device does not provide dax"
> - " operation zero_page_range()\n",
> - __host ? __host : "Unknown");
> + if (WARN_ON_ONCE(ops && !ops->zero_page_range))
> return ERR_PTR(-EINVAL);
> - }
> -
> - host = kstrdup(__host, GFP_KERNEL);
> - if (__host && !host)
> - return ERR_PTR(-ENOMEM);
>
> minor = ida_simple_get(&dax_minor_ida, 0, MINORMASK+1, GFP_KERNEL);
> if (minor < 0)
> - goto err_minor;
> + return ERR_PTR(-ENOMEM);
>
> devt = MKDEV(MAJOR(dax_devt), minor);
> dax_dev = dax_dev_get(devt);
> if (!dax_dev)
> goto err_dev;
>
> - dax_add_host(dax_dev, host);
> dax_dev->ops = ops;
> dax_dev->private = private;
> if (flags & DAXDEV_F_SYNC)
> @@ -531,8 +479,6 @@ struct dax_device *alloc_dax(void *private, const char *__host,
>
> err_dev:
> ida_simple_remove(&dax_minor_ida, minor);
> - err_minor:
> - kfree(host);
> return ERR_PTR(-ENOMEM);
> }
> EXPORT_SYMBOL_GPL(alloc_dax);
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 893fca738a3d8..782a076f61f81 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -1683,6 +1683,7 @@ static void cleanup_mapped_device(struct mapped_device *md)
> bioset_exit(&md->io_bs);
>
> if (md->dax_dev) {

Not a problem introduced by this patch, but this needs to be:

if (!IS_ERR_OR_NULL(md->dax_dev))

...as alloc_dev() calls this after md->dax_dev allocation might have failed.


> + dax_remove_host(md->disk);

> kill_dax(md->dax_dev);
> put_dax(md->dax_dev);
> md->dax_dev = NULL;
> @@ -1784,10 +1785,11 @@ static struct mapped_device *alloc_dev(int minor)
> sprintf(md->disk->disk_name, "dm-%d", minor);
>
> if (IS_ENABLED(CONFIG_FS_DAX)) {
> - md->dax_dev = alloc_dax(md, md->disk->disk_name,
> - &dm_dax_ops, 0);
> + md->dax_dev = alloc_dax(md, &dm_dax_ops, 0);
> if (IS_ERR(md->dax_dev))
> goto bad;
> + if (dax_add_host(md->dax_dev, md->disk))
> + goto bad;
> }
>
> format_dev_t(md->name, MKDEV(_major, minor));
> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> index 9cc0d0ebfad16..8783ad7370856 100644
> --- a/drivers/nvdimm/pmem.c
> +++ b/drivers/nvdimm/pmem.c
> @@ -379,6 +379,7 @@ static void pmem_release_disk(void *__pmem)
> {
> struct pmem_device *pmem = __pmem;
>
> + dax_remove_host(pmem->disk);
> kill_dax(pmem->dax_dev);
> put_dax(pmem->dax_dev);
> del_gendisk(pmem->disk);
> @@ -495,10 +496,11 @@ static int pmem_attach_disk(struct device *dev,
>
> if (is_nvdimm_sync(nd_region))
> flags = DAXDEV_F_SYNC;
> - dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops, flags);
> - if (IS_ERR(dax_dev)) {
> + dax_dev = alloc_dax(pmem, &pmem_dax_ops, flags);
> + if (IS_ERR(dax_dev))
> return PTR_ERR(dax_dev);
> - }
> + if (dax_add_host(dax_dev, disk))
> + return -ENOMEM;

This leaks the dax_dev. Perhaps this wants devm_alloc_dax() and
devm_dax_add_host() rather than piggybacking on the pmem_release_disk
devm action.

Other changes look good.

2021-11-23 03:35:48

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 05/29] dax: remove the pgmap sanity checks in generic_fsdax_supported

On Tue, Nov 9, 2021 at 12:33 AM Christoph Hellwig <[email protected]> wrote:
>
> Drivers that register a dax_dev should make sure it works, no need
> to double check from the file system.

Reviewed-by: Dan Williams <[email protected]>

...with a self-reminder to migrate this validation to a unit test to
backstop any future refactoring of the memmap reservation code.

2021-11-23 03:41:14

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 06/29] dax: move the partition alignment check into fs_dax_get_by_bdev

On Tue, Nov 9, 2021 at 12:33 AM Christoph Hellwig <[email protected]> wrote:
>
> fs_dax_get_by_bdev is the primary interface to find a dax device for a
> block device, so move the partition alignment check there instead of
> wiring it up through ->dax_supported.
>

Reviewed-by: Dan Williams <[email protected]>

2021-11-23 03:51:29

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 07/29] xfs: factor out a xfs_setup_dax_always helper

On Tue, Nov 9, 2021 at 12:33 AM Christoph Hellwig <[email protected]> wrote:
>
> Factor out another DAX setup helper to simplify future changes. Also
> move the experimental warning after the checks to not clutter the log
> too much if the setup failed.
>
> Signed-off-by: Christoph Hellwig <[email protected]>

Reviewed-by: Dan Williams <[email protected]>

2021-11-23 03:58:32

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 08/29] dax: remove dax_capable

On Tue, Nov 9, 2021 at 12:33 AM Christoph Hellwig <[email protected]> wrote:
>
> Just open code the block size and dax_dev == NULL checks in the callers.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> Acked-by: Mike Snitzer <[email protected]>

You dropped Gao Xiang's reviewed-by:

https://lore.kernel.org/r/[email protected]

...and Darrick's

https://lore.kernel.org/r/20211019154447.GL24282@magnolia

...which had a few more review comments below, otherwise you can also add:

Reviewed-by: Dan Williams <[email protected]>


> ---
> drivers/dax/super.c | 36 ------------------------------------
> drivers/md/dm-table.c | 22 +++++++++++-----------
> drivers/md/dm.c | 21 ---------------------
> drivers/md/dm.h | 4 ----
> drivers/nvdimm/pmem.c | 1 -
> drivers/s390/block/dcssblk.c | 1 -
> fs/erofs/super.c | 11 +++++++----
> fs/ext2/super.c | 6 ++++--
> fs/ext4/super.c | 9 ++++++---
> fs/xfs/xfs_super.c | 21 ++++++++-------------
> include/linux/dax.h | 14 --------------
> 11 files changed, 36 insertions(+), 110 deletions(-)
>
> diff --git a/drivers/dax/super.c b/drivers/dax/super.c
> index 482fe775324a4..803942586d1b6 100644
> --- a/drivers/dax/super.c
> +++ b/drivers/dax/super.c
> @@ -108,42 +108,6 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
> return dax_dev;
> }
> EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
> -
> -bool generic_fsdax_supported(struct dax_device *dax_dev,
> - struct block_device *bdev, int blocksize, sector_t start,
> - sector_t sectors)
> -{
> - if (blocksize != PAGE_SIZE) {
> - pr_info("%pg: error: unsupported blocksize for dax\n", bdev);
> - return false;
> - }
> -
> - if (!dax_dev) {
> - pr_debug("%pg: error: dax unsupported by block device\n", bdev);
> - return false;
> - }
> -
> - return true;
> -}
> -EXPORT_SYMBOL_GPL(generic_fsdax_supported);
> -
> -bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
> - int blocksize, sector_t start, sector_t len)
> -{
> - bool ret = false;
> - int id;
> -
> - if (!dax_dev)
> - return false;
> -
> - id = dax_read_lock();
> - if (dax_alive(dax_dev) && dax_dev->ops->dax_supported)
> - ret = dax_dev->ops->dax_supported(dax_dev, bdev, blocksize,
> - start, len);
> - dax_read_unlock(id);
> - return ret;
> -}
> -EXPORT_SYMBOL_GPL(dax_supported);
> #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */
>
> enum dax_device_flags {
> diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
> index bcddc5effd155..f4915a7d5dc84 100644
> --- a/drivers/md/dm-table.c
> +++ b/drivers/md/dm-table.c
> @@ -806,12 +806,14 @@ void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type)
> EXPORT_SYMBOL_GPL(dm_table_set_type);
>
> /* validate the dax capability of the target device span */
> -int device_not_dax_capable(struct dm_target *ti, struct dm_dev *dev,
> +static int device_not_dax_capable(struct dm_target *ti, struct dm_dev *dev,
> sector_t start, sector_t len, void *data)
> {
> - int blocksize = *(int *) data;
> + if (dev->dax_dev)
> + return false;
>
> - return !dax_supported(dev->dax_dev, dev->bdev, blocksize, start, len);
> + DMDEBUG("%pg: error: dax unsupported by block device", dev->bdev);
> + return true;
> }
>
> /* Check devices support synchronous DAX */
> @@ -821,8 +823,8 @@ static int device_not_dax_synchronous_capable(struct dm_target *ti, struct dm_de
> return !dev->dax_dev || !dax_synchronous(dev->dax_dev);
> }
>
> -bool dm_table_supports_dax(struct dm_table *t,
> - iterate_devices_callout_fn iterate_fn, int *blocksize)
> +static bool dm_table_supports_dax(struct dm_table *t,
> + iterate_devices_callout_fn iterate_fn)
> {
> struct dm_target *ti;
> unsigned i;
> @@ -835,7 +837,7 @@ bool dm_table_supports_dax(struct dm_table *t,
> return false;
>
> if (!ti->type->iterate_devices ||
> - ti->type->iterate_devices(ti, iterate_fn, blocksize))
> + ti->type->iterate_devices(ti, iterate_fn, NULL))
> return false;
> }
>
> @@ -862,7 +864,6 @@ static int dm_table_determine_type(struct dm_table *t)
> struct dm_target *tgt;
> struct list_head *devices = dm_table_get_devices(t);
> enum dm_queue_mode live_md_type = dm_get_md_type(t->md);
> - int page_size = PAGE_SIZE;
>
> if (t->type != DM_TYPE_NONE) {
> /* target already set the table's type */
> @@ -906,7 +907,7 @@ static int dm_table_determine_type(struct dm_table *t)
> verify_bio_based:
> /* We must use this table as bio-based */
> t->type = DM_TYPE_BIO_BASED;
> - if (dm_table_supports_dax(t, device_not_dax_capable, &page_size) ||
> + if (dm_table_supports_dax(t, device_not_dax_capable) ||
> (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) {
> t->type = DM_TYPE_DAX_BIO_BASED;
> }
> @@ -1976,7 +1977,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
> struct queue_limits *limits)
> {
> bool wc = false, fua = false;
> - int page_size = PAGE_SIZE;
> int r;
>
> /*
> @@ -2010,9 +2010,9 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
> }
> blk_queue_write_cache(q, wc, fua);
>
> - if (dm_table_supports_dax(t, device_not_dax_capable, &page_size)) {
> + if (dm_table_supports_dax(t, device_not_dax_capable)) {
> blk_queue_flag_set(QUEUE_FLAG_DAX, q);
> - if (dm_table_supports_dax(t, device_not_dax_synchronous_capable, NULL))
> + if (dm_table_supports_dax(t, device_not_dax_synchronous_capable))
> set_dax_synchronous(t->md->dax_dev);
> }
> else
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 782a076f61f81..282008afc465f 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -1027,26 +1027,6 @@ static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
> return ret;
> }
>
> -static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
> - int blocksize, sector_t start, sector_t len)
> -{
> - struct mapped_device *md = dax_get_private(dax_dev);
> - struct dm_table *map;
> - bool ret = false;
> - int srcu_idx;
> -
> - map = dm_get_live_table(md, &srcu_idx);
> - if (!map)
> - goto out;
> -
> - ret = dm_table_supports_dax(map, device_not_dax_capable, &blocksize);
> -
> -out:
> - dm_put_live_table(md, srcu_idx);
> -
> - return ret;
> -}
> -
> static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
> void *addr, size_t bytes, struct iov_iter *i)
> {
> @@ -3052,7 +3032,6 @@ static const struct block_device_operations dm_rq_blk_dops = {
>
> static const struct dax_operations dm_dax_ops = {
> .direct_access = dm_dax_direct_access,
> - .dax_supported = dm_dax_supported,
> .copy_from_iter = dm_dax_copy_from_iter,
> .copy_to_iter = dm_dax_copy_to_iter,
> .zero_page_range = dm_dax_zero_page_range,
> diff --git a/drivers/md/dm.h b/drivers/md/dm.h
> index 742d9c80efe19..9013dc1a7b002 100644
> --- a/drivers/md/dm.h
> +++ b/drivers/md/dm.h
> @@ -73,10 +73,6 @@ bool dm_table_bio_based(struct dm_table *t);
> bool dm_table_request_based(struct dm_table *t);
> void dm_table_free_md_mempools(struct dm_table *t);
> struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
> -bool dm_table_supports_dax(struct dm_table *t, iterate_devices_callout_fn fn,
> - int *blocksize);
> -int device_not_dax_capable(struct dm_target *ti, struct dm_dev *dev,
> - sector_t start, sector_t len, void *data);
>
> void dm_lock_md_type(struct mapped_device *md);
> void dm_unlock_md_type(struct mapped_device *md);
> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> index 8783ad7370856..0d66339875523 100644
> --- a/drivers/nvdimm/pmem.c
> +++ b/drivers/nvdimm/pmem.c
> @@ -321,7 +321,6 @@ static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
>
> static const struct dax_operations pmem_dax_ops = {
> .direct_access = pmem_dax_direct_access,
> - .dax_supported = generic_fsdax_supported,
> .copy_from_iter = pmem_copy_from_iter,
> .copy_to_iter = pmem_copy_to_iter,
> .zero_page_range = pmem_dax_zero_page_range,
> diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
> index 657e492f2bc26..e65e83764d1ce 100644
> --- a/drivers/s390/block/dcssblk.c
> +++ b/drivers/s390/block/dcssblk.c
> @@ -72,7 +72,6 @@ static int dcssblk_dax_zero_page_range(struct dax_device *dax_dev,
>
> static const struct dax_operations dcssblk_dax_ops = {
> .direct_access = dcssblk_dax_direct_access,
> - .dax_supported = generic_fsdax_supported,
> .copy_from_iter = dcssblk_dax_copy_from_iter,
> .copy_to_iter = dcssblk_dax_copy_to_iter,
> .zero_page_range = dcssblk_dax_zero_page_range,
> diff --git a/fs/erofs/super.c b/fs/erofs/super.c
> index 6a969b1e0ee6b..0aed886473c8d 100644
> --- a/fs/erofs/super.c
> +++ b/fs/erofs/super.c
> @@ -652,10 +652,13 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
> if (err)
> return err;
>
> - if (test_opt(&sbi->opt, DAX_ALWAYS) &&
> - !dax_supported(sbi->dax_dev, sb->s_bdev, EROFS_BLKSIZ, 0, bdev_nr_sectors(sb->s_bdev))) {
> - errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
> - clear_opt(&sbi->opt, DAX_ALWAYS);
> + if (test_opt(&sbi->opt, DAX_ALWAYS)) {
> + BUILD_BUG_ON(EROFS_BLKSIZ != PAGE_SIZE);
> +
> + if (!sbi->dax_dev) {
> + errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
> + clear_opt(&sbi->opt, DAX_ALWAYS);
> + }
> }
> sb->s_flags |= SB_RDONLY | SB_NOATIME;
> sb->s_maxbytes = MAX_LFS_FILESIZE;
> diff --git a/fs/ext2/super.c b/fs/ext2/super.c
> index d8d580b609baa..a964066a80aa7 100644
> --- a/fs/ext2/super.c
> +++ b/fs/ext2/super.c
> @@ -946,11 +946,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
> blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
>
> if (test_opt(sb, DAX)) {
> - if (!dax_supported(dax_dev, sb->s_bdev, blocksize, 0,
> - bdev_nr_sectors(sb->s_bdev))) {
> + if (!dax_dev) {
> ext2_msg(sb, KERN_ERR,
> "DAX unsupported by block device. Turning off DAX.");
> clear_opt(sbi->s_mount_opt, DAX);
> + } else if (blocksize != PAGE_SIZE) {
> + ext2_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n");

Per Darrick, drop the '\n'.

> + clear_opt(sbi->s_mount_opt, DAX);
> }
> }
>
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index a320c54202d95..eb4df43abd76e 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -4300,9 +4300,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> goto failed_mount;
> }
>
> - if (dax_supported(dax_dev, sb->s_bdev, blocksize, 0,
> - bdev_nr_sectors(sb->s_bdev)))
> - set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
> + if (dax_dev) {
> + if (blocksize == PAGE_SIZE)
> + set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
> + else
> + ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n");

...another one.

> + }
>
> if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) {
> if (ext4_has_feature_inline_data(sb)) {
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 875fd3151d6c9..3a45d5caa28d5 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -331,28 +331,23 @@ xfs_set_inode_alloc(
> return xfs_is_inode32(mp) ? maxagi : agcount;
> }
>
> -static bool
> -xfs_buftarg_is_dax(
> - struct super_block *sb,
> - struct xfs_buftarg *bt)
> -{
> - return dax_supported(bt->bt_daxdev, bt->bt_bdev, sb->s_blocksize, 0,
> - bdev_nr_sectors(bt->bt_bdev));
> -}
> -
> static int
> xfs_setup_dax_always(
> struct xfs_mount *mp)
> {
> - struct super_block *sb = mp->m_super;
> -
> - if (!xfs_buftarg_is_dax(sb, mp->m_ddev_targp) &&
> - (!mp->m_rtdev_targp || !xfs_buftarg_is_dax(sb, mp->m_rtdev_targp))) {
> + if (!mp->m_ddev_targp->bt_daxdev &&
> + (!mp->m_rtdev_targp || !mp->m_rtdev_targp->bt_daxdev)) {
> xfs_alert(mp,
> "DAX unsupported by block device. Turning off DAX.");
> goto disable_dax;
> }
>
> + if (mp->m_super->s_blocksize != PAGE_SIZE) {
> + xfs_alert(mp,
> + "DAX not supported for blocksize. Turning off DAX.\n");

...and one more


> + goto disable_dax;
> + }
> +
> if (xfs_has_reflink(mp)) {
> xfs_alert(mp, "DAX and reflink cannot be used together!");
> return -EINVAL;
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index e2e9a67004cbd..439c3c70e347b 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -111,12 +111,6 @@ int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff);
> #if IS_ENABLED(CONFIG_FS_DAX)
> int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
> void dax_remove_host(struct gendisk *disk);
> -bool generic_fsdax_supported(struct dax_device *dax_dev,
> - struct block_device *bdev, int blocksize, sector_t start,
> - sector_t sectors);
> -
> -bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
> - int blocksize, sector_t start, sector_t len);
>
> static inline void fs_put_dax(struct dax_device *dax_dev)
> {
> @@ -139,14 +133,6 @@ static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
> static inline void dax_remove_host(struct gendisk *disk)
> {
> }
> -#define generic_fsdax_supported NULL
> -
> -static inline bool dax_supported(struct dax_device *dax_dev,
> - struct block_device *bdev, int blocksize, sector_t start,
> - sector_t len)
> -{
> - return false;
> -}
>
> static inline void fs_put_dax(struct dax_device *dax_dev)
> {
> --
> 2.30.2
>

2021-11-23 03:59:39

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 09/29] dm-linear: add a linear_dax_pgoff helper

On Tue, Nov 9, 2021 at 12:33 AM Christoph Hellwig <[email protected]> wrote:
>
> Add a helper to perform the entire remapping for DAX accesses. This
> helper open codes bdev_dax_pgoff given that the alignment checks have
> already been done by the submitting file system and don't need to be
> repeated.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> Acked-by: Mike Snitzer <[email protected]>

Reviewed-by: Dan Williams <[email protected]>

2021-11-23 04:02:55

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 10/29] dm-log-writes: add a log_writes_dax_pgoff helper

On Tue, Nov 9, 2021 at 12:33 AM Christoph Hellwig <[email protected]> wrote:
>
> Add a helper to perform the entire remapping for DAX accesses. This
> helper open codes bdev_dax_pgoff given that the alignment checks have
> already been done by the submitting file system and don't need to be
> repeated.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> Acked-by: Mike Snitzer <[email protected]>

Reviewed-by: Dan Williams <[email protected]>

2021-11-23 04:16:14

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 11/29] dm-stripe: add a stripe_dax_pgoff helper

On Tue, Nov 9, 2021 at 12:33 AM Christoph Hellwig <[email protected]> wrote:
>
> Add a helper to perform the entire remapping for DAX accesses. This
> helper open codes bdev_dax_pgoff given that the alignment checks have
> already been done by the submitting file system and don't need to be
> repeated.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> Acked-by: Mike Snitzer <[email protected]>

Reviewed-by: Dan Williams <[email protected]>

2021-11-23 04:17:27

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 12/29] fsdax: remove a pointless __force cast in copy_cow_page_dax

On Tue, Nov 9, 2021 at 12:34 AM Christoph Hellwig <[email protected]> wrote:
>
> Despite its name copy_user_page expected kernel addresses, which is what
> we already have.

Yup,

Reviewed-by: Dan Williams <[email protected]>

2021-11-23 05:56:24

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 02/29] dm: make the DAX support dependend on CONFIG_FS_DAX

On Mon, Nov 22, 2021 at 06:54:09PM -0800, Dan Williams wrote:
> On Thu, Nov 18, 2021 at 10:55 PM Christoph Hellwig <[email protected]> wrote:
> >
> > On Wed, Nov 17, 2021 at 09:23:44AM -0800, Dan Williams wrote:
> > > Applied, fixed the spelling of 'dependent' in the subject and picked
> > > up Mike's Ack from the previous send:
> > >
> > > https://lore.kernel.org/r/[email protected]
> > >
> > > Christoph, any particular reason you did not pick up the tags from the
> > > last posting?
> >
> > I thought I did, but apparently I've missed some.
>
> I'll reply with the ones I see missing that need carrying over and add
> my own reviewed-by then you can send me a pull request when ready,
> deal?

Ok.

2021-11-23 05:57:49

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 04/29] dax: simplify the dax_device <-> gendisk association

On Mon, Nov 22, 2021 at 07:33:06PM -0800, Dan Williams wrote:
> Is it time to add a "DAX" symbol namespace?

What would be the benefit?

2021-11-23 19:34:06

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 04/29] dax: simplify the dax_device <-> gendisk association

On Mon, Nov 22, 2021 at 9:58 PM Christoph Hellwig <[email protected]> wrote:
>
> On Mon, Nov 22, 2021 at 07:33:06PM -0800, Dan Williams wrote:
> > Is it time to add a "DAX" symbol namespace?
>
> What would be the benefit?

Just the small benefit of identifying DAX core users with a common
grep line, and to indicate that DAX exports are more intertwined than
standalone exports, but yeah those are minor.

2021-11-23 19:36:00

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 13/29] fsdax: use a saner calling convention for copy_cow_page_dax

On Tue, Nov 9, 2021 at 12:33 AM Christoph Hellwig <[email protected]> wrote:
>
> Just pass the vm_fault and iomap_iter structures, and figure out the rest
> locally. Note that this requires moving dax_iomap_sector up in the file.

Looks good,

Reviewed-by: Dan Williams <[email protected]>

>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> fs/dax.c | 29 +++++++++++++----------------
> 1 file changed, 13 insertions(+), 16 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 73bd1439d8089..e51b4129d1b65 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -709,26 +709,31 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
> return __dax_invalidate_entry(mapping, index, false);
> }
>
> -static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_dev,
> - sector_t sector, struct page *to, unsigned long vaddr)
> +static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos)
> {
> + return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
> +}
> +
> +static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
> +{
> + sector_t sector = dax_iomap_sector(&iter->iomap, iter->pos);
> void *vto, *kaddr;
> pgoff_t pgoff;
> long rc;
> int id;
>
> - rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
> + rc = bdev_dax_pgoff(iter->iomap.bdev, sector, PAGE_SIZE, &pgoff);
> if (rc)
> return rc;
>
> id = dax_read_lock();
> - rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
> + rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, &kaddr, NULL);
> if (rc < 0) {
> dax_read_unlock(id);
> return rc;
> }
> - vto = kmap_atomic(to);
> - copy_user_page(vto, kaddr, vaddr, to);
> + vto = kmap_atomic(vmf->cow_page);
> + copy_user_page(vto, kaddr, vmf->address, vmf->cow_page);
> kunmap_atomic(vto);
> dax_read_unlock(id);
> return 0;
> @@ -1005,11 +1010,6 @@ int dax_writeback_mapping_range(struct address_space *mapping,
> }
> EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
>
> -static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos)
> -{
> - return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
> -}
> -
> static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
> pfn_t *pfnp)
> {
> @@ -1332,19 +1332,16 @@ static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
> static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
> const struct iomap_iter *iter)
> {
> - sector_t sector = dax_iomap_sector(&iter->iomap, iter->pos);
> - unsigned long vaddr = vmf->address;
> vm_fault_t ret;
> int error = 0;
>
> switch (iter->iomap.type) {
> case IOMAP_HOLE:
> case IOMAP_UNWRITTEN:
> - clear_user_highpage(vmf->cow_page, vaddr);
> + clear_user_highpage(vmf->cow_page, vmf->address);
> break;
> case IOMAP_MAPPED:
> - error = copy_cow_page_dax(iter->iomap.bdev, iter->iomap.dax_dev,
> - sector, vmf->cow_page, vaddr);
> + error = copy_cow_page_dax(vmf, iter);
> break;
> default:
> WARN_ON_ONCE(1);
> --
> 2.30.2
>

2021-11-23 19:45:13

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 14/29] fsdax: simplify the pgoff calculation

On Tue, Nov 9, 2021 at 12:33 AM Christoph Hellwig <[email protected]> wrote:
>
> Replace the two steps of dax_iomap_sector and bdev_dax_pgoff with a
> single dax_iomap_pgoff helper that avoids lots of cumbersome sector
> conversions.

Looks good,

Reviewed-by: Dan Williams <[email protected]>

>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> drivers/dax/super.c | 14 --------------
> fs/dax.c | 35 ++++++++++-------------------------
> include/linux/dax.h | 1 -
> 3 files changed, 10 insertions(+), 40 deletions(-)
>
> diff --git a/drivers/dax/super.c b/drivers/dax/super.c
> index 803942586d1b6..c0910687fbcb2 100644
> --- a/drivers/dax/super.c
> +++ b/drivers/dax/super.c
> @@ -67,20 +67,6 @@ void dax_remove_host(struct gendisk *disk)
> }
> EXPORT_SYMBOL_GPL(dax_remove_host);
>
> -int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
> - pgoff_t *pgoff)
> -{
> - sector_t start_sect = bdev ? get_start_sect(bdev) : 0;
> - phys_addr_t phys_off = (start_sect + sector) * 512;
> -
> - if (pgoff)
> - *pgoff = PHYS_PFN(phys_off);
> - if (phys_off % PAGE_SIZE || size % PAGE_SIZE)
> - return -EINVAL;
> - return 0;
> -}
> -EXPORT_SYMBOL(bdev_dax_pgoff);
> -
> /**
> * dax_get_by_host() - temporary lookup mechanism for filesystem-dax
> * @bdev: block device to find a dax_device for
> diff --git a/fs/dax.c b/fs/dax.c
> index e51b4129d1b65..5364549d67a48 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -709,23 +709,22 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
> return __dax_invalidate_entry(mapping, index, false);
> }
>
> -static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos)
> +static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
> {
> - return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
> + phys_addr_t paddr = iomap->addr + (pos & PAGE_MASK) - iomap->offset;
> +
> + if (iomap->bdev)
> + paddr += (get_start_sect(iomap->bdev) << SECTOR_SHIFT);
> + return PHYS_PFN(paddr);
> }
>
> static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
> {
> - sector_t sector = dax_iomap_sector(&iter->iomap, iter->pos);
> + pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos);
> void *vto, *kaddr;
> - pgoff_t pgoff;
> long rc;
> int id;
>
> - rc = bdev_dax_pgoff(iter->iomap.bdev, sector, PAGE_SIZE, &pgoff);
> - if (rc)
> - return rc;
> -
> id = dax_read_lock();
> rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, &kaddr, NULL);
> if (rc < 0) {
> @@ -1013,14 +1012,10 @@ EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
> static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
> pfn_t *pfnp)
> {
> - const sector_t sector = dax_iomap_sector(iomap, pos);
> - pgoff_t pgoff;
> + pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
> int id, rc;
> long length;
>
> - rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
> - if (rc)
> - return rc;
> id = dax_read_lock();
> length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
> NULL, pfnp);
> @@ -1129,7 +1124,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
> s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
> {
> sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
> - pgoff_t pgoff;
> + pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
> long rc, id;
> void *kaddr;
> bool page_aligned = false;
> @@ -1140,10 +1135,6 @@ s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
> (size == PAGE_SIZE))
> page_aligned = true;
>
> - rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff);
> - if (rc)
> - return rc;
> -
> id = dax_read_lock();
>
> if (page_aligned)
> @@ -1169,7 +1160,6 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
> const struct iomap *iomap = &iomi->iomap;
> loff_t length = iomap_length(iomi);
> loff_t pos = iomi->pos;
> - struct block_device *bdev = iomap->bdev;
> struct dax_device *dax_dev = iomap->dax_dev;
> loff_t end = pos + length, done = 0;
> ssize_t ret = 0;
> @@ -1203,9 +1193,8 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
> while (pos < end) {
> unsigned offset = pos & (PAGE_SIZE - 1);
> const size_t size = ALIGN(length + offset, PAGE_SIZE);
> - const sector_t sector = dax_iomap_sector(iomap, pos);
> + pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
> ssize_t map_len;
> - pgoff_t pgoff;
> void *kaddr;
>
> if (fatal_signal_pending(current)) {
> @@ -1213,10 +1202,6 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
> break;
> }
>
> - ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
> - if (ret)
> - break;
> -
> map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
> &kaddr, NULL);
> if (map_len < 0) {
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index 439c3c70e347b..324363b798ecd 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -107,7 +107,6 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
> #endif
>
> struct writeback_control;
> -int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff);
> #if IS_ENABLED(CONFIG_FS_DAX)
> int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
> void dax_remove_host(struct gendisk *disk);
> --
> 2.30.2
>

2021-11-23 21:16:05

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 15/29] xfs: add xfs_zero_range and xfs_truncate_page helpers

On Tue, Nov 9, 2021 at 12:34 AM Christoph Hellwig <[email protected]> wrote:
>
> From: Shiyang Ruan <[email protected]>
>
> Add helpers to prepare for using different DAX operations.
>
> Signed-off-by: Shiyang Ruan <[email protected]>
> [hch: split from a larger patch + slight cleanups]
> Signed-off-by: Christoph Hellwig <[email protected]>

Looks good to me.

Reviewed-by: Dan Williams <[email protected]>

2021-11-23 21:17:22

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 16/29] fsdax: simplify the offset check in dax_iomap_zero

On Tue, Nov 9, 2021 at 12:34 AM Christoph Hellwig <[email protected]> wrote:
>
> The file relative offset must have the same alignment as the storage
> offset, so use that and get rid of the call to iomap_sector.

Agree.

Reviewed-by: Dan Williams <[email protected]>

2021-11-23 21:22:27

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 17/29] fsdax: factor out a dax_memzero helper

On Tue, Nov 9, 2021 at 12:34 AM Christoph Hellwig <[email protected]> wrote:
>
> Factor out a helper for the "manual" zeroing of a DAX range to clean
> up dax_iomap_zero a lot.
>

Small / optional fixup below:

Reviewed-by: Dan Williams <[email protected]>

> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> fs/dax.c | 36 +++++++++++++++++++-----------------
> 1 file changed, 19 insertions(+), 17 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index d7a923d152240..dc9ebeff850ab 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1121,34 +1121,36 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
> }
> #endif /* CONFIG_FS_DAX_PMD */
>
> +static int dax_memzero(struct dax_device *dax_dev, pgoff_t pgoff,
> + unsigned int offset, size_t size)
> +{
> + void *kaddr;
> + long rc;
> +
> + rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
> + if (rc >= 0) {

Technically this should be "> 0" because dax_direct_access() returns
nr_available_pages @pgoff, but this isn't broken because
dax_direct_access() converts the "zero pages available" case into
-ERANGE.

> + memset(kaddr + offset, 0, size);
> + dax_flush(dax_dev, kaddr + offset, size);
> + }
> + return rc;
> +}
> +
> s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
> {
> pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
> long rc, id;
> - void *kaddr;
> - bool page_aligned = false;
> unsigned offset = offset_in_page(pos);
> unsigned size = min_t(u64, PAGE_SIZE - offset, length);
>
> - if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
> - page_aligned = true;
> -
> id = dax_read_lock();
> -
> - if (page_aligned)
> + if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
> rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
> else
> - rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
> - if (rc < 0) {
> - dax_read_unlock(id);
> - return rc;
> - }
> -
> - if (!page_aligned) {
> - memset(kaddr + offset, 0, size);
> - dax_flush(iomap->dax_dev, kaddr + offset, size);
> - }
> + rc = dax_memzero(iomap->dax_dev, pgoff, offset, size);
> dax_read_unlock(id);
> +
> + if (rc < 0)
> + return rc;
> return size;
> }
>
> --
> 2.30.2
>

2021-11-23 21:46:51

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 18/29] fsdax: decouple zeroing from the iomap buffered I/O code

On Tue, Nov 9, 2021 at 12:33 AM Christoph Hellwig <[email protected]> wrote:
>
> Unshare the DAX and iomap buffered I/O page zeroing code. This code
> previously did an IS_DAX check deep inside the iomap code, which in
> fact was the only DAX check in the code. Instead move these checks
> into the callers. Most callers already have DAX special casing anyway
> and XFS will need it for reflink support as well.

Looks ok, a tangential question below about iomap_truncate_page(), but
you can add:

Reviewed-by: Dan Williams <[email protected]>

>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> fs/dax.c | 77 ++++++++++++++++++++++++++++++++++--------
> fs/ext2/inode.c | 6 ++--
> fs/ext4/inode.c | 4 +--
> fs/iomap/buffered-io.c | 35 +++++++------------
> fs/xfs/xfs_iomap.c | 6 ++++
> include/linux/dax.h | 6 +++-
> 6 files changed, 91 insertions(+), 43 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index dc9ebeff850ab..5b52b878124ac 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1135,24 +1135,73 @@ static int dax_memzero(struct dax_device *dax_dev, pgoff_t pgoff,
> return rc;
> }
>
> -s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
> +static loff_t dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
> {
> - pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
> - long rc, id;
> - unsigned offset = offset_in_page(pos);
> - unsigned size = min_t(u64, PAGE_SIZE - offset, length);
> + const struct iomap *iomap = &iter->iomap;
> + const struct iomap *srcmap = iomap_iter_srcmap(iter);
> + loff_t pos = iter->pos;
> + loff_t length = iomap_length(iter);
> + loff_t written = 0;
> +
> + /* already zeroed? we're done. */
> + if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
> + return length;
> +
> + do {
> + unsigned offset = offset_in_page(pos);
> + unsigned size = min_t(u64, PAGE_SIZE - offset, length);
> + pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
> + long rc;
> + int id;
>
> - id = dax_read_lock();
> - if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
> - rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
> - else
> - rc = dax_memzero(iomap->dax_dev, pgoff, offset, size);
> - dax_read_unlock(id);
> + id = dax_read_lock();
> + if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
> + rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
> + else
> + rc = dax_memzero(iomap->dax_dev, pgoff, offset, size);
> + dax_read_unlock(id);
>
> - if (rc < 0)
> - return rc;
> - return size;
> + if (rc < 0)
> + return rc;
> + pos += size;
> + length -= size;
> + written += size;
> + if (did_zero)
> + *did_zero = true;
> + } while (length > 0);
> +
> + return written;
> +}
> +
> +int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
> + const struct iomap_ops *ops)
> +{
> + struct iomap_iter iter = {
> + .inode = inode,
> + .pos = pos,
> + .len = len,
> + .flags = IOMAP_ZERO,
> + };
> + int ret;
> +
> + while ((ret = iomap_iter(&iter, ops)) > 0)
> + iter.processed = dax_zero_iter(&iter, did_zero);
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(dax_zero_range);
> +
> +int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
> + const struct iomap_ops *ops)
> +{
> + unsigned int blocksize = i_blocksize(inode);
> + unsigned int off = pos & (blocksize - 1);
> +
> + /* Block boundary? Nothing to do */
> + if (!off)
> + return 0;

It took me a moment to figure out why this was correct. I see it was
also copied from iomap_truncate_page(). It makes sense for DAX where
blocksize >= PAGE_SIZE so it's always the case that the amount of
capacity to zero relative to a page is from @pos to the end of the
block. Is there something else that protects the blocksize < PAGE_SIZE
case outside of DAX?

Nothing to change for this patch, just a question I had while reviewing.

> + return dax_zero_range(inode, pos, blocksize - off, did_zero, ops);
> }
> +EXPORT_SYMBOL_GPL(dax_truncate_page);
>
> static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
> struct iov_iter *iter)
> diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
> index 333fa62661d56..ae9993018a015 100644
> --- a/fs/ext2/inode.c
> +++ b/fs/ext2/inode.c
> @@ -1297,9 +1297,9 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
> inode_dio_wait(inode);
>
> if (IS_DAX(inode)) {
> - error = iomap_zero_range(inode, newsize,
> - PAGE_ALIGN(newsize) - newsize, NULL,
> - &ext2_iomap_ops);
> + error = dax_zero_range(inode, newsize,
> + PAGE_ALIGN(newsize) - newsize, NULL,
> + &ext2_iomap_ops);
> } else if (test_opt(inode->i_sb, NOBH))
> error = nobh_truncate_page(inode->i_mapping,
> newsize, ext2_get_block);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 0f06305167d5a..8c443b753b815 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3783,8 +3783,8 @@ static int ext4_block_zero_page_range(handle_t *handle,
> length = max;
>
> if (IS_DAX(inode)) {
> - return iomap_zero_range(inode, from, length, NULL,
> - &ext4_iomap_ops);
> + return dax_zero_range(inode, from, length, NULL,
> + &ext4_iomap_ops);
> }
> return __ext4_block_zero_page_range(handle, mapping, from, length);
> }
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 1753c26c8e76e..b1511255b4df8 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -870,26 +870,8 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
> }
> EXPORT_SYMBOL_GPL(iomap_file_unshare);
>
> -static s64 __iomap_zero_iter(struct iomap_iter *iter, loff_t pos, u64 length)
> -{
> - struct page *page;
> - int status;
> - unsigned offset = offset_in_page(pos);
> - unsigned bytes = min_t(u64, PAGE_SIZE - offset, length);
> -
> - status = iomap_write_begin(iter, pos, bytes, &page);
> - if (status)
> - return status;
> -
> - zero_user(page, offset, bytes);
> - mark_page_accessed(page);
> -
> - return iomap_write_end(iter, pos, bytes, bytes, page);
> -}
> -
> static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
> {
> - struct iomap *iomap = &iter->iomap;
> const struct iomap *srcmap = iomap_iter_srcmap(iter);
> loff_t pos = iter->pos;
> loff_t length = iomap_length(iter);
> @@ -900,12 +882,19 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
> return length;
>
> do {
> - s64 bytes;
> + unsigned offset = offset_in_page(pos);
> + size_t bytes = min_t(u64, PAGE_SIZE - offset, length);
> + struct page *page;
> + int status;
>
> - if (IS_DAX(iter->inode))
> - bytes = dax_iomap_zero(pos, length, iomap);
> - else
> - bytes = __iomap_zero_iter(iter, pos, length);
> + status = iomap_write_begin(iter, pos, bytes, &page);
> + if (status)
> + return status;
> +
> + zero_user(page, offset, bytes);
> + mark_page_accessed(page);
> +
> + bytes = iomap_write_end(iter, pos, bytes, bytes, page);
> if (bytes < 0)
> return bytes;
>
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index d6d71ae9f2ae4..604000b6243ec 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -1321,6 +1321,9 @@ xfs_zero_range(
> {
> struct inode *inode = VFS_I(ip);
>
> + if (IS_DAX(inode))
> + return dax_zero_range(inode, pos, len, did_zero,
> + &xfs_buffered_write_iomap_ops);
> return iomap_zero_range(inode, pos, len, did_zero,
> &xfs_buffered_write_iomap_ops);
> }
> @@ -1333,6 +1336,9 @@ xfs_truncate_page(
> {
> struct inode *inode = VFS_I(ip);
>
> + if (IS_DAX(inode))
> + return dax_truncate_page(inode, pos, did_zero,
> + &xfs_buffered_write_iomap_ops);
> return iomap_truncate_page(inode, pos, did_zero,
> &xfs_buffered_write_iomap_ops);
> }
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index 324363b798ecd..a5cc2f1aa840e 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -14,6 +14,7 @@ typedef unsigned long dax_entry_t;
> struct dax_device;
> struct gendisk;
> struct iomap_ops;
> +struct iomap_iter;
> struct iomap;
>
> struct dax_operations {
> @@ -124,6 +125,10 @@ struct page *dax_layout_busy_page(struct address_space *mapping);
> struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
> dax_entry_t dax_lock_page(struct page *page);
> void dax_unlock_page(struct page *page, dax_entry_t cookie);
> +int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
> + const struct iomap_ops *ops);
> +int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
> + const struct iomap_ops *ops);
> #else
> static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
> {
> @@ -204,7 +209,6 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
> int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
> int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
> pgoff_t index);
> -s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap);
> static inline bool dax_mapping(struct address_space *mapping)
> {
> return mapping->host && IS_DAX(mapping->host);
> --
> 2.30.2
>

2021-11-23 21:48:48

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 19/29] ext2: cleanup the dax handling in ext2_fill_super

On Tue, Nov 9, 2021 at 12:34 AM Christoph Hellwig <[email protected]> wrote:
>
> Only call fs_dax_get_by_bdev once the sbi has been allocated and remove
> the need for the dax_dev local variable.

Looks good.

Reviewed-by: Dan Williams <[email protected]>

2021-11-23 21:50:05

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 20/29] ext4: cleanup the dax handling in ext4_fill_super

On Tue, Nov 9, 2021 at 12:34 AM Christoph Hellwig <[email protected]> wrote:
>
> Only call fs_dax_get_by_bdev once the sbi has been allocated and remove
> the need for the dax_dev local variable.

Looks good.

Reviewed-by: Dan Williams <[email protected]>

2021-11-23 22:26:00

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 06/29] dax: move the partition alignment check into fs_dax_get_by_bdev

On Tue, Nov 09, 2021 at 09:32:46AM +0100, Christoph Hellwig wrote:
> fs_dax_get_by_bdev is the primary interface to find a dax device for a
> block device, so move the partition alignment check there instead of
> wiring it up through ->dax_supported.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> drivers/dax/super.c | 23 ++++++-----------------
> 1 file changed, 6 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/dax/super.c b/drivers/dax/super.c
> index 04fc680542e8d..482fe775324a4 100644
> --- a/drivers/dax/super.c
> +++ b/drivers/dax/super.c
> @@ -93,6 +93,12 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
> if (!blk_queue_dax(bdev->bd_disk->queue))
> return NULL;
>
> + if ((get_start_sect(bdev) * SECTOR_SIZE) % PAGE_SIZE ||
> + (bdev_nr_sectors(bdev) * SECTOR_SIZE) % PAGE_SIZE) {

Do we have to be careful about 64-bit division here, or do we not
support DAX on 32-bit?

> + pr_info("%pg: error: unaligned partition for dax\n", bdev);

I also wonder if this should be ratelimited...?

--D

> + return NULL;
> + }
> +
> id = dax_read_lock();
> dax_dev = xa_load(&dax_hosts, (unsigned long)bdev->bd_disk);
> if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode))
> @@ -107,10 +113,6 @@ bool generic_fsdax_supported(struct dax_device *dax_dev,
> struct block_device *bdev, int blocksize, sector_t start,
> sector_t sectors)
> {
> - pgoff_t pgoff, pgoff_end;
> - sector_t last_page;
> - int err;
> -
> if (blocksize != PAGE_SIZE) {
> pr_info("%pg: error: unsupported blocksize for dax\n", bdev);
> return false;
> @@ -121,19 +123,6 @@ bool generic_fsdax_supported(struct dax_device *dax_dev,
> return false;
> }
>
> - err = bdev_dax_pgoff(bdev, start, PAGE_SIZE, &pgoff);
> - if (err) {
> - pr_info("%pg: error: unaligned partition for dax\n", bdev);
> - return false;
> - }
> -
> - last_page = PFN_DOWN((start + sectors - 1) * 512) * PAGE_SIZE / 512;
> - err = bdev_dax_pgoff(bdev, last_page, PAGE_SIZE, &pgoff_end);
> - if (err) {
> - pr_info("%pg: error: unaligned partition for dax\n", bdev);
> - return false;
> - }
> -
> return true;
> }
> EXPORT_SYMBOL_GPL(generic_fsdax_supported);
> --
> 2.30.2
>

2021-11-23 22:31:30

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 08/29] dax: remove dax_capable

On Tue, Nov 09, 2021 at 09:32:48AM +0100, Christoph Hellwig wrote:
> Just open code the block size and dax_dev == NULL checks in the callers.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> Acked-by: Mike Snitzer <[email protected]>
> ---
> drivers/dax/super.c | 36 ------------------------------------
> drivers/md/dm-table.c | 22 +++++++++++-----------
> drivers/md/dm.c | 21 ---------------------
> drivers/md/dm.h | 4 ----
> drivers/nvdimm/pmem.c | 1 -
> drivers/s390/block/dcssblk.c | 1 -
> fs/erofs/super.c | 11 +++++++----
> fs/ext2/super.c | 6 ++++--
> fs/ext4/super.c | 9 ++++++---
> fs/xfs/xfs_super.c | 21 ++++++++-------------
> include/linux/dax.h | 14 --------------
> 11 files changed, 36 insertions(+), 110 deletions(-)
>
> diff --git a/drivers/dax/super.c b/drivers/dax/super.c
> index 482fe775324a4..803942586d1b6 100644
> --- a/drivers/dax/super.c
> +++ b/drivers/dax/super.c
> @@ -108,42 +108,6 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
> return dax_dev;
> }
> EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
> -
> -bool generic_fsdax_supported(struct dax_device *dax_dev,
> - struct block_device *bdev, int blocksize, sector_t start,
> - sector_t sectors)
> -{
> - if (blocksize != PAGE_SIZE) {
> - pr_info("%pg: error: unsupported blocksize for dax\n", bdev);
> - return false;
> - }
> -
> - if (!dax_dev) {
> - pr_debug("%pg: error: dax unsupported by block device\n", bdev);
> - return false;
> - }
> -
> - return true;
> -}
> -EXPORT_SYMBOL_GPL(generic_fsdax_supported);
> -
> -bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
> - int blocksize, sector_t start, sector_t len)
> -{
> - bool ret = false;
> - int id;
> -
> - if (!dax_dev)
> - return false;
> -
> - id = dax_read_lock();
> - if (dax_alive(dax_dev) && dax_dev->ops->dax_supported)
> - ret = dax_dev->ops->dax_supported(dax_dev, bdev, blocksize,
> - start, len);
> - dax_read_unlock(id);
> - return ret;
> -}
> -EXPORT_SYMBOL_GPL(dax_supported);

Hooray, more dax helpers go away!

> #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */
>
> enum dax_device_flags {

<skipping to xfs part>

> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 875fd3151d6c9..3a45d5caa28d5 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -331,28 +331,23 @@ xfs_set_inode_alloc(
> return xfs_is_inode32(mp) ? maxagi : agcount;
> }
>
> -static bool
> -xfs_buftarg_is_dax(
> - struct super_block *sb,
> - struct xfs_buftarg *bt)
> -{
> - return dax_supported(bt->bt_daxdev, bt->bt_bdev, sb->s_blocksize, 0,
> - bdev_nr_sectors(bt->bt_bdev));
> -}
> -
> static int
> xfs_setup_dax_always(
> struct xfs_mount *mp)
> {
> - struct super_block *sb = mp->m_super;
> -
> - if (!xfs_buftarg_is_dax(sb, mp->m_ddev_targp) &&
> - (!mp->m_rtdev_targp || !xfs_buftarg_is_dax(sb, mp->m_rtdev_targp))) {
> + if (!mp->m_ddev_targp->bt_daxdev &&
> + (!mp->m_rtdev_targp || !mp->m_rtdev_targp->bt_daxdev)) {

Nit: This ^ paren should be indented one more column because it's a
sub-clause of the if() test.

> xfs_alert(mp,
> "DAX unsupported by block device. Turning off DAX.");
> goto disable_dax;
> }
>
> + if (mp->m_super->s_blocksize != PAGE_SIZE) {
> + xfs_alert(mp,
> + "DAX not supported for blocksize. Turning off DAX.\n");

Nit: xfs_alert() already adds a newline to the end of the format string.

With those two things fixed up, for the XFS parts and everything XFS
depends on:
Reviewed-by: Darrick J. Wong <[email protected]>

(I don't feel confident saying that I've looked at dcssblk or dm-table,
though I didn't see anything obviously wrong there...)

--D

> + goto disable_dax;
> + }
> +
> if (xfs_has_reflink(mp)) {
> xfs_alert(mp, "DAX and reflink cannot be used together!");
> return -EINVAL;
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index e2e9a67004cbd..439c3c70e347b 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -111,12 +111,6 @@ int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff);
> #if IS_ENABLED(CONFIG_FS_DAX)
> int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
> void dax_remove_host(struct gendisk *disk);
> -bool generic_fsdax_supported(struct dax_device *dax_dev,
> - struct block_device *bdev, int blocksize, sector_t start,
> - sector_t sectors);
> -
> -bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
> - int blocksize, sector_t start, sector_t len);
>
> static inline void fs_put_dax(struct dax_device *dax_dev)
> {
> @@ -139,14 +133,6 @@ static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
> static inline void dax_remove_host(struct gendisk *disk)
> {
> }
> -#define generic_fsdax_supported NULL
> -
> -static inline bool dax_supported(struct dax_device *dax_dev,
> - struct block_device *bdev, int blocksize, sector_t start,
> - sector_t len)
> -{
> - return false;
> -}
>
> static inline void fs_put_dax(struct dax_device *dax_dev)
> {
> --
> 2.30.2
>

2021-11-23 22:33:28

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 12/29] fsdax: remove a pointless __force cast in copy_cow_page_dax

On Tue, Nov 09, 2021 at 09:32:52AM +0100, Christoph Hellwig wrote:
> Despite its name copy_user_page expects kernel addresses, which is what
> we already have.
>
> Signed-off-by: Christoph Hellwig <[email protected]>

Looks ok,
Reviewed-by: Darrick J. Wong <[email protected]>

--D

> ---
> fs/dax.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 4e3e5a283a916..73bd1439d8089 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -728,7 +728,7 @@ static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_d
> return rc;
> }
> vto = kmap_atomic(to);
> - copy_user_page(vto, (void __force *)kaddr, vaddr, to);
> + copy_user_page(vto, kaddr, vaddr, to);
> kunmap_atomic(vto);
> dax_read_unlock(id);
> return 0;
> --
> 2.30.2
>

2021-11-23 22:33:54

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 13/29] fsdax: use a saner calling convention for copy_cow_page_dax

On Tue, Nov 09, 2021 at 09:32:53AM +0100, Christoph Hellwig wrote:
> Just pass the vm_fault and iomap_iter structures, and figure out the rest
> locally. Note that this requires moving dax_iomap_sector up in the file.
>
> Signed-off-by: Christoph Hellwig <[email protected]>

Yes, nice cleanup!
Reviewed-by: Darrick J. Wong <[email protected]>

--D

> ---
> fs/dax.c | 29 +++++++++++++----------------
> 1 file changed, 13 insertions(+), 16 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 73bd1439d8089..e51b4129d1b65 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -709,26 +709,31 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
> return __dax_invalidate_entry(mapping, index, false);
> }
>
> -static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_dev,
> - sector_t sector, struct page *to, unsigned long vaddr)
> +static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos)
> {
> + return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
> +}
> +
> +static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
> +{
> + sector_t sector = dax_iomap_sector(&iter->iomap, iter->pos);
> void *vto, *kaddr;
> pgoff_t pgoff;
> long rc;
> int id;
>
> - rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
> + rc = bdev_dax_pgoff(iter->iomap.bdev, sector, PAGE_SIZE, &pgoff);
> if (rc)
> return rc;
>
> id = dax_read_lock();
> - rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
> + rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, &kaddr, NULL);
> if (rc < 0) {
> dax_read_unlock(id);
> return rc;
> }
> - vto = kmap_atomic(to);
> - copy_user_page(vto, kaddr, vaddr, to);
> + vto = kmap_atomic(vmf->cow_page);
> + copy_user_page(vto, kaddr, vmf->address, vmf->cow_page);
> kunmap_atomic(vto);
> dax_read_unlock(id);
> return 0;
> @@ -1005,11 +1010,6 @@ int dax_writeback_mapping_range(struct address_space *mapping,
> }
> EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
>
> -static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos)
> -{
> - return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
> -}
> -
> static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
> pfn_t *pfnp)
> {
> @@ -1332,19 +1332,16 @@ static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
> static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
> const struct iomap_iter *iter)
> {
> - sector_t sector = dax_iomap_sector(&iter->iomap, iter->pos);
> - unsigned long vaddr = vmf->address;
> vm_fault_t ret;
> int error = 0;
>
> switch (iter->iomap.type) {
> case IOMAP_HOLE:
> case IOMAP_UNWRITTEN:
> - clear_user_highpage(vmf->cow_page, vaddr);
> + clear_user_highpage(vmf->cow_page, vmf->address);
> break;
> case IOMAP_MAPPED:
> - error = copy_cow_page_dax(iter->iomap.bdev, iter->iomap.dax_dev,
> - sector, vmf->cow_page, vaddr);
> + error = copy_cow_page_dax(vmf, iter);
> break;
> default:
> WARN_ON_ONCE(1);
> --
> 2.30.2
>

2021-11-23 22:36:44

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 14/29] fsdax: simplify the pgoff calculation

On Tue, Nov 09, 2021 at 09:32:54AM +0100, Christoph Hellwig wrote:
> Replace the two steps of dax_iomap_sector and bdev_dax_pgoff with a
> single dax_iomap_pgoff helper that avoids lots of cumbersome sector
> conversions.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> drivers/dax/super.c | 14 --------------
> fs/dax.c | 35 ++++++++++-------------------------
> include/linux/dax.h | 1 -
> 3 files changed, 10 insertions(+), 40 deletions(-)
>
> diff --git a/drivers/dax/super.c b/drivers/dax/super.c
> index 803942586d1b6..c0910687fbcb2 100644
> --- a/drivers/dax/super.c
> +++ b/drivers/dax/super.c
> @@ -67,20 +67,6 @@ void dax_remove_host(struct gendisk *disk)
> }
> EXPORT_SYMBOL_GPL(dax_remove_host);
>
> -int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
> - pgoff_t *pgoff)
> -{
> - sector_t start_sect = bdev ? get_start_sect(bdev) : 0;
> - phys_addr_t phys_off = (start_sect + sector) * 512;
> -
> - if (pgoff)
> - *pgoff = PHYS_PFN(phys_off);
> - if (phys_off % PAGE_SIZE || size % PAGE_SIZE)

AFAICT, we're relying on fs_dax_get_by_bdev to have validated this
previously, which is why the error return stuff goes away?

If so,
Reviewed-by: Darrick J. Wong <[email protected]>

--D


> - return -EINVAL;
> - return 0;
> -}
> -EXPORT_SYMBOL(bdev_dax_pgoff);
> -
> /**
> * dax_get_by_host() - temporary lookup mechanism for filesystem-dax
> * @bdev: block device to find a dax_device for
> diff --git a/fs/dax.c b/fs/dax.c
> index e51b4129d1b65..5364549d67a48 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -709,23 +709,22 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
> return __dax_invalidate_entry(mapping, index, false);
> }
>
> -static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos)
> +static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
> {
> - return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
> + phys_addr_t paddr = iomap->addr + (pos & PAGE_MASK) - iomap->offset;
> +
> + if (iomap->bdev)
> + paddr += (get_start_sect(iomap->bdev) << SECTOR_SHIFT);
> + return PHYS_PFN(paddr);
> }
>
> static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
> {
> - sector_t sector = dax_iomap_sector(&iter->iomap, iter->pos);
> + pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos);
> void *vto, *kaddr;
> - pgoff_t pgoff;
> long rc;
> int id;
>
> - rc = bdev_dax_pgoff(iter->iomap.bdev, sector, PAGE_SIZE, &pgoff);
> - if (rc)
> - return rc;
> -
> id = dax_read_lock();
> rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, &kaddr, NULL);
> if (rc < 0) {
> @@ -1013,14 +1012,10 @@ EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
> static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size,
> pfn_t *pfnp)
> {
> - const sector_t sector = dax_iomap_sector(iomap, pos);
> - pgoff_t pgoff;
> + pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
> int id, rc;
> long length;
>
> - rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
> - if (rc)
> - return rc;
> id = dax_read_lock();
> length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
> NULL, pfnp);
> @@ -1129,7 +1124,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
> s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
> {
> sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
> - pgoff_t pgoff;
> + pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
> long rc, id;
> void *kaddr;
> bool page_aligned = false;
> @@ -1140,10 +1135,6 @@ s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
> (size == PAGE_SIZE))
> page_aligned = true;
>
> - rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff);
> - if (rc)
> - return rc;
> -
> id = dax_read_lock();
>
> if (page_aligned)
> @@ -1169,7 +1160,6 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
> const struct iomap *iomap = &iomi->iomap;
> loff_t length = iomap_length(iomi);
> loff_t pos = iomi->pos;
> - struct block_device *bdev = iomap->bdev;
> struct dax_device *dax_dev = iomap->dax_dev;
> loff_t end = pos + length, done = 0;
> ssize_t ret = 0;
> @@ -1203,9 +1193,8 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
> while (pos < end) {
> unsigned offset = pos & (PAGE_SIZE - 1);
> const size_t size = ALIGN(length + offset, PAGE_SIZE);
> - const sector_t sector = dax_iomap_sector(iomap, pos);
> + pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
> ssize_t map_len;
> - pgoff_t pgoff;
> void *kaddr;
>
> if (fatal_signal_pending(current)) {
> @@ -1213,10 +1202,6 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
> break;
> }
>
> - ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
> - if (ret)
> - break;
> -
> map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
> &kaddr, NULL);
> if (map_len < 0) {
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index 439c3c70e347b..324363b798ecd 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -107,7 +107,6 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
> #endif
>
> struct writeback_control;
> -int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff);
> #if IS_ENABLED(CONFIG_FS_DAX)
> int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
> void dax_remove_host(struct gendisk *disk);
> --
> 2.30.2
>

2021-11-23 22:37:20

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 15/29] xfs: add xfs_zero_range and xfs_truncate_page helpers

On Tue, Nov 09, 2021 at 09:32:55AM +0100, Christoph Hellwig wrote:
> From: Shiyang Ruan <[email protected]>
>
> Add helpers to prepare for using different DAX operations.
>
> Signed-off-by: Shiyang Ruan <[email protected]>
> [hch: split from a larger patch + slight cleanups]
> Signed-off-by: Christoph Hellwig <[email protected]>

Looks good to me,
Reviewed-by: Darrick J. Wong <[email protected]>

--D

> ---
> fs/xfs/xfs_bmap_util.c | 7 +++----
> fs/xfs/xfs_file.c | 3 +--
> fs/xfs/xfs_iomap.c | 25 +++++++++++++++++++++++++
> fs/xfs/xfs_iomap.h | 4 ++++
> fs/xfs/xfs_iops.c | 7 +++----
> fs/xfs/xfs_reflink.c | 3 +--
> 6 files changed, 37 insertions(+), 12 deletions(-)
>
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index 73a36b7be3bd1..797ea0c8b14e1 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -1001,7 +1001,7 @@ xfs_free_file_space(
>
> /*
> * Now that we've unmap all full blocks we'll have to zero out any
> - * partial block at the beginning and/or end. iomap_zero_range is smart
> + * partial block at the beginning and/or end. xfs_zero_range is smart
> * enough to skip any holes, including those we just created, but we
> * must take care not to zero beyond EOF and enlarge i_size.
> */
> @@ -1009,15 +1009,14 @@ xfs_free_file_space(
> return 0;
> if (offset + len > XFS_ISIZE(ip))
> len = XFS_ISIZE(ip) - offset;
> - error = iomap_zero_range(VFS_I(ip), offset, len, NULL,
> - &xfs_buffered_write_iomap_ops);
> + error = xfs_zero_range(ip, offset, len, NULL);
> if (error)
> return error;
>
> /*
> * If we zeroed right up to EOF and EOF straddles a page boundary we
> * must make sure that the post-EOF area is also zeroed because the
> - * page could be mmap'd and iomap_zero_range doesn't do that for us.
> + * page could be mmap'd and xfs_zero_range doesn't do that for us.
> * Writeback of the eof page will do this, albeit clumsily.
> */
> if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) {
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 27594738b0d18..8d4c5ca261bd7 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -437,8 +437,7 @@ xfs_file_write_checks(
> }
>
> trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
> - error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
> - NULL, &xfs_buffered_write_iomap_ops);
> + error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
> if (error)
> return error;
> } else
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 093758440ad53..d6d71ae9f2ae4 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -1311,3 +1311,28 @@ xfs_xattr_iomap_begin(
> const struct iomap_ops xfs_xattr_iomap_ops = {
> .iomap_begin = xfs_xattr_iomap_begin,
> };
> +
> +int
> +xfs_zero_range(
> + struct xfs_inode *ip,
> + loff_t pos,
> + loff_t len,
> + bool *did_zero)
> +{
> + struct inode *inode = VFS_I(ip);
> +
> + return iomap_zero_range(inode, pos, len, did_zero,
> + &xfs_buffered_write_iomap_ops);
> +}
> +
> +int
> +xfs_truncate_page(
> + struct xfs_inode *ip,
> + loff_t pos,
> + bool *did_zero)
> +{
> + struct inode *inode = VFS_I(ip);
> +
> + return iomap_truncate_page(inode, pos, did_zero,
> + &xfs_buffered_write_iomap_ops);
> +}
> diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
> index 7d3703556d0e0..f1a281ab9328c 100644
> --- a/fs/xfs/xfs_iomap.h
> +++ b/fs/xfs/xfs_iomap.h
> @@ -20,6 +20,10 @@ xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
> int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
> struct xfs_bmbt_irec *, u16);
>
> +int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
> + bool *did_zero);
> +int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero);
> +
> static inline xfs_filblks_t
> xfs_aligned_fsb_count(
> xfs_fileoff_t offset_fsb,
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index a607d6aca5c4d..ab5ef52b2a9ff 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -911,8 +911,8 @@ xfs_setattr_size(
> */
> if (newsize > oldsize) {
> trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
> - error = iomap_zero_range(inode, oldsize, newsize - oldsize,
> - &did_zeroing, &xfs_buffered_write_iomap_ops);
> + error = xfs_zero_range(ip, oldsize, newsize - oldsize,
> + &did_zeroing);
> } else {
> /*
> * iomap won't detect a dirty page over an unwritten block (or a
> @@ -924,8 +924,7 @@ xfs_setattr_size(
> newsize);
> if (error)
> return error;
> - error = iomap_truncate_page(inode, newsize, &did_zeroing,
> - &xfs_buffered_write_iomap_ops);
> + error = xfs_truncate_page(ip, newsize, &did_zeroing);
> }
>
> if (error)
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index cb0edb1d68ef1..facce5c076d83 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -1269,8 +1269,7 @@ xfs_reflink_zero_posteof(
> return 0;
>
> trace_xfs_zero_eof(ip, isize, pos - isize);
> - return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
> - &xfs_buffered_write_iomap_ops);
> + return xfs_zero_range(ip, isize, pos - isize, NULL);
> }
>
> /*
> --
> 2.30.2
>

2021-11-23 22:40:07

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 16/29] fsdax: simplify the offset check in dax_iomap_zero

On Tue, Nov 09, 2021 at 09:32:56AM +0100, Christoph Hellwig wrote:
> The file relative offset must have the same alignment as the storage
> offset, so use that and get rid of the call to iomap_sector.
>
> Signed-off-by: Christoph Hellwig <[email protected]>

Reviewed-by: Darrick J. Wong <[email protected]>

--D

> ---
> fs/dax.c | 4 +---
> 1 file changed, 1 insertion(+), 3 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 5364549d67a48..d7a923d152240 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1123,7 +1123,6 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
>
> s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
> {
> - sector_t sector = iomap_sector(iomap, pos & PAGE_MASK);
> pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
> long rc, id;
> void *kaddr;
> @@ -1131,8 +1130,7 @@ s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
> unsigned offset = offset_in_page(pos);
> unsigned size = min_t(u64, PAGE_SIZE - offset, length);
>
> - if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) &&
> - (size == PAGE_SIZE))
> + if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
> page_aligned = true;
>
> id = dax_read_lock();
> --
> 2.30.2
>

2021-11-23 22:45:00

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 17/29] fsdax: factor out a dax_memzero helper

On Tue, Nov 23, 2021 at 01:22:13PM -0800, Dan Williams wrote:
> On Tue, Nov 9, 2021 at 12:34 AM Christoph Hellwig <[email protected]> wrote:
> >
> > Factor out a helper for the "manual" zeroing of a DAX range to clean
> > up dax_iomap_zero a lot.
> >
>
> Small / optional fixup below:
>
> Reviewed-by: Dan Williams <[email protected]>
>
> > Signed-off-by: Christoph Hellwig <[email protected]>
> > ---
> > fs/dax.c | 36 +++++++++++++++++++-----------------
> > 1 file changed, 19 insertions(+), 17 deletions(-)
> >
> > diff --git a/fs/dax.c b/fs/dax.c
> > index d7a923d152240..dc9ebeff850ab 100644
> > --- a/fs/dax.c
> > +++ b/fs/dax.c
> > @@ -1121,34 +1121,36 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
> > }
> > #endif /* CONFIG_FS_DAX_PMD */
> >
> > +static int dax_memzero(struct dax_device *dax_dev, pgoff_t pgoff,
> > + unsigned int offset, size_t size)
> > +{
> > + void *kaddr;
> > + long rc;
> > +
> > + rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
> > + if (rc >= 0) {
>
> Technically this should be "> 0" because dax_direct_access() returns
> nr_available_pages @pgoff, but this isn't broken because
> dax_direct_access() converts the "zero pages available" case into
> -ERANGE.

Agreed. With that fixed,
Reviewed-by: Darrick J. Wong <[email protected]>

--D

>
> > + memset(kaddr + offset, 0, size);
> > + dax_flush(dax_dev, kaddr + offset, size);
> > + }
> > + return rc;
> > +}
> > +
> > s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
> > {
> > pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
> > long rc, id;
> > - void *kaddr;
> > - bool page_aligned = false;
> > unsigned offset = offset_in_page(pos);
> > unsigned size = min_t(u64, PAGE_SIZE - offset, length);
> >
> > - if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
> > - page_aligned = true;
> > -
> > id = dax_read_lock();
> > -
> > - if (page_aligned)
> > + if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
> > rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
> > else
> > - rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL);
> > - if (rc < 0) {
> > - dax_read_unlock(id);
> > - return rc;
> > - }
> > -
> > - if (!page_aligned) {
> > - memset(kaddr + offset, 0, size);
> > - dax_flush(iomap->dax_dev, kaddr + offset, size);
> > - }
> > + rc = dax_memzero(iomap->dax_dev, pgoff, offset, size);
> > dax_read_unlock(id);
> > +
> > + if (rc < 0)
> > + return rc;
> > return size;
> > }
> >
> > --
> > 2.30.2
> >

2021-11-23 22:53:23

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 18/29] fsdax: decouple zeroing from the iomap buffered I/O code

On Tue, Nov 09, 2021 at 09:32:58AM +0100, Christoph Hellwig wrote:
> Unshare the DAX and iomap buffered I/O page zeroing code. This code
> previously did an IS_DAX check deep inside the iomap code, which in
> fact was the only DAX check in the code. Instead move these checks
> into the callers. Most callers already have DAX special casing anyway
> and XFS will need it for reflink support as well.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> fs/dax.c | 77 ++++++++++++++++++++++++++++++++++--------
> fs/ext2/inode.c | 6 ++--
> fs/ext4/inode.c | 4 +--
> fs/iomap/buffered-io.c | 35 +++++++------------
> fs/xfs/xfs_iomap.c | 6 ++++
> include/linux/dax.h | 6 +++-
> 6 files changed, 91 insertions(+), 43 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index dc9ebeff850ab..5b52b878124ac 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1135,24 +1135,73 @@ static int dax_memzero(struct dax_device *dax_dev, pgoff_t pgoff,
> return rc;
> }
>
> -s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
> +static loff_t dax_zero_iter(struct iomap_iter *iter, bool *did_zero)

Shouldn't this return value remain s64 to match iomap_iter.processed?

> {
> - pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
> - long rc, id;
> - unsigned offset = offset_in_page(pos);
> - unsigned size = min_t(u64, PAGE_SIZE - offset, length);
> + const struct iomap *iomap = &iter->iomap;
> + const struct iomap *srcmap = iomap_iter_srcmap(iter);
> + loff_t pos = iter->pos;
> + loff_t length = iomap_length(iter);

u64..

(I wonder, should iomap_iter have a debug check for iomap_length >
INT64_MAX?)

> + loff_t written = 0;

s64...

> +
> + /* already zeroed? we're done. */
> + if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
> + return length;
> +
> + do {
> + unsigned offset = offset_in_page(pos);
> + unsigned size = min_t(u64, PAGE_SIZE - offset, length);
> + pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
> + long rc;
> + int id;
>
> - id = dax_read_lock();
> - if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
> - rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
> - else
> - rc = dax_memzero(iomap->dax_dev, pgoff, offset, size);
> - dax_read_unlock(id);
> + id = dax_read_lock();
> + if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
> + rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
> + else
> + rc = dax_memzero(iomap->dax_dev, pgoff, offset, size);
> + dax_read_unlock(id);
>
> - if (rc < 0)
> - return rc;
> - return size;
> + if (rc < 0)
> + return rc;
> + pos += size;
> + length -= size;
> + written += size;
> + if (did_zero)
> + *did_zero = true;
> + } while (length > 0);
> +
> + return written;
> +}
> +
> +int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
> + const struct iomap_ops *ops)
> +{
> + struct iomap_iter iter = {
> + .inode = inode,
> + .pos = pos,
> + .len = len,
> + .flags = IOMAP_ZERO,
> + };
> + int ret;
> +
> + while ((ret = iomap_iter(&iter, ops)) > 0)
> + iter.processed = dax_zero_iter(&iter, did_zero);
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(dax_zero_range);
> +
> +int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
> + const struct iomap_ops *ops)
> +{
> + unsigned int blocksize = i_blocksize(inode);
> + unsigned int off = pos & (blocksize - 1);
> +
> + /* Block boundary? Nothing to do */
> + if (!off)
> + return 0;
> + return dax_zero_range(inode, pos, blocksize - off, did_zero, ops);
> }
> +EXPORT_SYMBOL_GPL(dax_truncate_page);
>
> static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
> struct iov_iter *iter)
> diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
> index 333fa62661d56..ae9993018a015 100644
> --- a/fs/ext2/inode.c
> +++ b/fs/ext2/inode.c
> @@ -1297,9 +1297,9 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
> inode_dio_wait(inode);
>
> if (IS_DAX(inode)) {
> - error = iomap_zero_range(inode, newsize,
> - PAGE_ALIGN(newsize) - newsize, NULL,
> - &ext2_iomap_ops);
> + error = dax_zero_range(inode, newsize,
> + PAGE_ALIGN(newsize) - newsize, NULL,
> + &ext2_iomap_ops);
> } else if (test_opt(inode->i_sb, NOBH))
> error = nobh_truncate_page(inode->i_mapping,
> newsize, ext2_get_block);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 0f06305167d5a..8c443b753b815 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3783,8 +3783,8 @@ static int ext4_block_zero_page_range(handle_t *handle,
> length = max;
>
> if (IS_DAX(inode)) {
> - return iomap_zero_range(inode, from, length, NULL,
> - &ext4_iomap_ops);
> + return dax_zero_range(inode, from, length, NULL,
> + &ext4_iomap_ops);
> }
> return __ext4_block_zero_page_range(handle, mapping, from, length);
> }
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 1753c26c8e76e..b1511255b4df8 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -870,26 +870,8 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
> }
> EXPORT_SYMBOL_GPL(iomap_file_unshare);
>
> -static s64 __iomap_zero_iter(struct iomap_iter *iter, loff_t pos, u64 length)
> -{
> - struct page *page;
> - int status;
> - unsigned offset = offset_in_page(pos);
> - unsigned bytes = min_t(u64, PAGE_SIZE - offset, length);
> -
> - status = iomap_write_begin(iter, pos, bytes, &page);
> - if (status)
> - return status;
> -
> - zero_user(page, offset, bytes);
> - mark_page_accessed(page);
> -
> - return iomap_write_end(iter, pos, bytes, bytes, page);
> -}
> -
> static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
> {
> - struct iomap *iomap = &iter->iomap;
> const struct iomap *srcmap = iomap_iter_srcmap(iter);
> loff_t pos = iter->pos;
> loff_t length = iomap_length(iter);
> @@ -900,12 +882,19 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
> return length;
>
> do {
> - s64 bytes;
> + unsigned offset = offset_in_page(pos);
> + size_t bytes = min_t(u64, PAGE_SIZE - offset, length);
> + struct page *page;
> + int status;
>
> - if (IS_DAX(iter->inode))
> - bytes = dax_iomap_zero(pos, length, iomap);

I'm glad this kind of IS_DAX clunkiness is all going away finally.

> - else
> - bytes = __iomap_zero_iter(iter, pos, length);
> + status = iomap_write_begin(iter, pos, bytes, &page);
> + if (status)
> + return status;
> +
> + zero_user(page, offset, bytes);
> + mark_page_accessed(page);
> +
> + bytes = iomap_write_end(iter, pos, bytes, bytes, page);
> if (bytes < 0)
> return bytes;
>
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index d6d71ae9f2ae4..604000b6243ec 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -1321,6 +1321,9 @@ xfs_zero_range(
> {
> struct inode *inode = VFS_I(ip);
>
> + if (IS_DAX(inode))
> + return dax_zero_range(inode, pos, len, did_zero,
> + &xfs_buffered_write_iomap_ops);
> return iomap_zero_range(inode, pos, len, did_zero,
> &xfs_buffered_write_iomap_ops);
> }
> @@ -1333,6 +1336,9 @@ xfs_truncate_page(
> {
> struct inode *inode = VFS_I(ip);
>
> + if (IS_DAX(inode))
> + return dax_truncate_page(inode, pos, did_zero,
> + &xfs_buffered_write_iomap_ops);
> return iomap_truncate_page(inode, pos, did_zero,
> &xfs_buffered_write_iomap_ops);
> }
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index 324363b798ecd..a5cc2f1aa840e 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -14,6 +14,7 @@ typedef unsigned long dax_entry_t;
> struct dax_device;
> struct gendisk;
> struct iomap_ops;
> +struct iomap_iter;
> struct iomap;
>
> struct dax_operations {
> @@ -124,6 +125,10 @@ struct page *dax_layout_busy_page(struct address_space *mapping);
> struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
> dax_entry_t dax_lock_page(struct page *page);
> void dax_unlock_page(struct page *page, dax_entry_t cookie);
> +int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
> + const struct iomap_ops *ops);
> +int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
> + const struct iomap_ops *ops);
> #else
> static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
> {
> @@ -204,7 +209,6 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
> int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
> int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
> pgoff_t index);
> -s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap);
> static inline bool dax_mapping(struct address_space *mapping)
> {
> return mapping->host && IS_DAX(mapping->host);
> --
> 2.30.2
>

2021-11-23 22:54:36

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 20/29] ext4: cleanup the dax handling in ext4_fill_super

On Tue, Nov 09, 2021 at 09:33:00AM +0100, Christoph Hellwig wrote:
> Only call fs_dax_get_by_bdev once the sbi has been allocated and remove
> the need for the dax_dev local variable.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> fs/ext4/super.c | 7 +++----
> 1 file changed, 3 insertions(+), 4 deletions(-)
>
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index eb4df43abd76e..b60401bb1c310 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -3879,7 +3879,6 @@ static void ext4_setup_csum_trigger(struct super_block *sb,
>
> static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> {
> - struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
> char *orig_data = kstrdup(data, GFP_KERNEL);
> struct buffer_head *bh, **group_desc;
> struct ext4_super_block *es = NULL;
> @@ -3910,12 +3909,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> if ((data && !orig_data) || !sbi)
> goto out_free_base;
>
> - sbi->s_daxdev = dax_dev;
> sbi->s_blockgroup_lock =
> kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
> if (!sbi->s_blockgroup_lock)
> goto out_free_base;
>
> + sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev);
> sb->s_fs_info = sbi;
> sbi->s_sb = sb;
> sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
> @@ -4300,7 +4299,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> goto failed_mount;
> }
>
> - if (dax_dev) {
> + if (sbi->s_daxdev) {
> if (blocksize == PAGE_SIZE)
> set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
> else
> @@ -5096,10 +5095,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> out_fail:
> sb->s_fs_info = NULL;
> kfree(sbi->s_blockgroup_lock);
> + fs_put_dax(sbi->s_daxdev );

Nit: no space before the paren ^ here.

With that fixed,
Reviewed-by: Darrick J. Wong <[email protected]>

--D

> out_free_base:
> kfree(sbi);
> kfree(orig_data);
> - fs_put_dax(dax_dev);
> return err ? err : ret;
> }
>
> --
> 2.30.2
>

2021-11-23 22:54:55

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 19/29] ext2: cleanup the dax handling in ext2_fill_super

On Tue, Nov 09, 2021 at 09:32:59AM +0100, Christoph Hellwig wrote:
> Only call fs_dax_get_by_bdev once the sbi has been allocated and remove
> the need for the dax_dev local variable.
>
> Signed-off-by: Christoph Hellwig <[email protected]>

Looks good,
Reviewed-by: Darrick J. Wong <[email protected]>

--D

> ---
> fs/ext2/super.c | 12 +++++-------
> 1 file changed, 5 insertions(+), 7 deletions(-)
>
> diff --git a/fs/ext2/super.c b/fs/ext2/super.c
> index a964066a80aa7..7e23482862e69 100644
> --- a/fs/ext2/super.c
> +++ b/fs/ext2/super.c
> @@ -802,7 +802,6 @@ static unsigned long descriptor_loc(struct super_block *sb,
>
> static int ext2_fill_super(struct super_block *sb, void *data, int silent)
> {
> - struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
> struct buffer_head * bh;
> struct ext2_sb_info * sbi;
> struct ext2_super_block * es;
> @@ -822,17 +821,17 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
>
> sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
> if (!sbi)
> - goto failed;
> + return -ENOMEM;
>
> sbi->s_blockgroup_lock =
> kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
> if (!sbi->s_blockgroup_lock) {
> kfree(sbi);
> - goto failed;
> + return -ENOMEM;
> }
> sb->s_fs_info = sbi;
> sbi->s_sb_block = sb_block;
> - sbi->s_daxdev = dax_dev;
> + sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev);
>
> spin_lock_init(&sbi->s_lock);
> ret = -EINVAL;
> @@ -946,7 +945,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
> blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
>
> if (test_opt(sb, DAX)) {
> - if (!dax_dev) {
> + if (!sbi->s_daxdev) {
> ext2_msg(sb, KERN_ERR,
> "DAX unsupported by block device. Turning off DAX.");
> clear_opt(sbi->s_mount_opt, DAX);
> @@ -1201,11 +1200,10 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
> failed_mount:
> brelse(bh);
> failed_sbi:
> + fs_put_dax(sbi->s_daxdev);
> sb->s_fs_info = NULL;
> kfree(sbi->s_blockgroup_lock);
> kfree(sbi);
> -failed:
> - fs_put_dax(dax_dev);
> return ret;
> }
>
> --
> 2.30.2
>

2021-11-23 22:55:51

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 21/29] xfs: move dax device handling into xfs_{alloc,free}_buftarg

On Tue, Nov 09, 2021 at 09:33:01AM +0100, Christoph Hellwig wrote:
> Hide the DAX device lookup from the xfs_super.c code.
>
> Reviewed-by: Christoph Hellwig <[email protected]>

This looks to be a straightforward conversion.
Reviewed-by: Darrick J. Wong <[email protected]>

--D

> ---
> fs/xfs/xfs_buf.c | 8 ++++----
> fs/xfs/xfs_buf.h | 4 ++--
> fs/xfs/xfs_super.c | 26 +++++---------------------
> 3 files changed, 11 insertions(+), 27 deletions(-)
>
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index 631c5a61d89b7..4d4553ffa7050 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -1892,6 +1892,7 @@ xfs_free_buftarg(
> list_lru_destroy(&btp->bt_lru);
>
> blkdev_issue_flush(btp->bt_bdev);
> + fs_put_dax(btp->bt_daxdev);
>
> kmem_free(btp);
> }
> @@ -1932,11 +1933,10 @@ xfs_setsize_buftarg_early(
> return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
> }
>
> -xfs_buftarg_t *
> +struct xfs_buftarg *
> xfs_alloc_buftarg(
> struct xfs_mount *mp,
> - struct block_device *bdev,
> - struct dax_device *dax_dev)
> + struct block_device *bdev)
> {
> xfs_buftarg_t *btp;
>
> @@ -1945,7 +1945,7 @@ xfs_alloc_buftarg(
> btp->bt_mount = mp;
> btp->bt_dev = bdev->bd_dev;
> btp->bt_bdev = bdev;
> - btp->bt_daxdev = dax_dev;
> + btp->bt_daxdev = fs_dax_get_by_bdev(bdev);
>
> /*
> * Buffer IO error rate limiting. Limit it to no more than 10 messages
> diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
> index 6b0200b8007d1..bd7f709f0d232 100644
> --- a/fs/xfs/xfs_buf.h
> +++ b/fs/xfs/xfs_buf.h
> @@ -338,8 +338,8 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
> /*
> * Handling of buftargs.
> */
> -extern struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *,
> - struct block_device *, struct dax_device *);
> +struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp,
> + struct block_device *bdev);
> extern void xfs_free_buftarg(struct xfs_buftarg *);
> extern void xfs_buftarg_wait(struct xfs_buftarg *);
> extern void xfs_buftarg_drain(struct xfs_buftarg *);
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 3a45d5caa28d5..7262716afb215 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -391,26 +391,19 @@ STATIC void
> xfs_close_devices(
> struct xfs_mount *mp)
> {
> - struct dax_device *dax_ddev = mp->m_ddev_targp->bt_daxdev;
> -
> if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
> struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
> - struct dax_device *dax_logdev = mp->m_logdev_targp->bt_daxdev;
>
> xfs_free_buftarg(mp->m_logdev_targp);
> xfs_blkdev_put(logdev);
> - fs_put_dax(dax_logdev);
> }
> if (mp->m_rtdev_targp) {
> struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
> - struct dax_device *dax_rtdev = mp->m_rtdev_targp->bt_daxdev;
>
> xfs_free_buftarg(mp->m_rtdev_targp);
> xfs_blkdev_put(rtdev);
> - fs_put_dax(dax_rtdev);
> }
> xfs_free_buftarg(mp->m_ddev_targp);
> - fs_put_dax(dax_ddev);
> }
>
> /*
> @@ -428,8 +421,6 @@ xfs_open_devices(
> struct xfs_mount *mp)
> {
> struct block_device *ddev = mp->m_super->s_bdev;
> - struct dax_device *dax_ddev = fs_dax_get_by_bdev(ddev);
> - struct dax_device *dax_logdev = NULL, *dax_rtdev = NULL;
> struct block_device *logdev = NULL, *rtdev = NULL;
> int error;
>
> @@ -439,8 +430,7 @@ xfs_open_devices(
> if (mp->m_logname) {
> error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
> if (error)
> - goto out;
> - dax_logdev = fs_dax_get_by_bdev(logdev);
> + return error;
> }
>
> if (mp->m_rtname) {
> @@ -454,25 +444,24 @@ xfs_open_devices(
> error = -EINVAL;
> goto out_close_rtdev;
> }
> - dax_rtdev = fs_dax_get_by_bdev(rtdev);
> }
>
> /*
> * Setup xfs_mount buffer target pointers
> */
> error = -ENOMEM;
> - mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, dax_ddev);
> + mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
> if (!mp->m_ddev_targp)
> goto out_close_rtdev;
>
> if (rtdev) {
> - mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, dax_rtdev);
> + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev);
> if (!mp->m_rtdev_targp)
> goto out_free_ddev_targ;
> }
>
> if (logdev && logdev != ddev) {
> - mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, dax_logdev);
> + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev);
> if (!mp->m_logdev_targp)
> goto out_free_rtdev_targ;
> } else {
> @@ -488,14 +477,9 @@ xfs_open_devices(
> xfs_free_buftarg(mp->m_ddev_targp);
> out_close_rtdev:
> xfs_blkdev_put(rtdev);
> - fs_put_dax(dax_rtdev);
> out_close_logdev:
> - if (logdev && logdev != ddev) {
> + if (logdev && logdev != ddev)
> xfs_blkdev_put(logdev);
> - fs_put_dax(dax_logdev);
> - }
> - out:
> - fs_put_dax(dax_ddev);
> return error;
> }
>
> --
> 2.30.2
>

2021-11-23 23:00:25

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 22/29] iomap: add a IOMAP_DAX flag

On Tue, Nov 09, 2021 at 09:33:02AM +0100, Christoph Hellwig wrote:
> Add a flag so that the file system can easily detect DAX operations.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> fs/dax.c | 7 ++++---
> include/linux/iomap.h | 1 +
> 2 files changed, 5 insertions(+), 3 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 5b52b878124ac..0bd6cdcbacfc4 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1180,7 +1180,7 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
> .inode = inode,
> .pos = pos,
> .len = len,
> - .flags = IOMAP_ZERO,
> + .flags = IOMAP_DAX | IOMAP_ZERO,
> };
> int ret;
>
> @@ -1308,6 +1308,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
> .inode = iocb->ki_filp->f_mapping->host,
> .pos = iocb->ki_pos,
> .len = iov_iter_count(iter),
> + .flags = IOMAP_DAX,
> };
> loff_t done = 0;
> int ret;
> @@ -1461,7 +1462,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
> .inode = mapping->host,
> .pos = (loff_t)vmf->pgoff << PAGE_SHIFT,
> .len = PAGE_SIZE,
> - .flags = IOMAP_FAULT,
> + .flags = IOMAP_DAX | IOMAP_FAULT,
> };
> vm_fault_t ret = 0;
> void *entry;
> @@ -1570,7 +1571,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
> struct iomap_iter iter = {
> .inode = mapping->host,
> .len = PMD_SIZE,
> - .flags = IOMAP_FAULT,
> + .flags = IOMAP_DAX | IOMAP_FAULT,
> };
> vm_fault_t ret = VM_FAULT_FALLBACK;
> pgoff_t max_pgoff;
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 6d1b08d0ae930..146a7e3e3ea11 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -141,6 +141,7 @@ struct iomap_page_ops {
> #define IOMAP_NOWAIT (1 << 5) /* do not block */
> #define IOMAP_OVERWRITE_ONLY (1 << 6) /* only pure overwrites allowed */
> #define IOMAP_UNSHARE (1 << 7) /* unshare_file_range */
> +#define IOMAP_DAX (1 << 8) /* DAX mapping */

Should this be #define'd to 0 when CONFIG_FS_DAX is not set, so that the
compiler will optimize out all the IOMAP_DAX bits if dax isn't enabled in
Kconfig? Kind of like what we do for S_DAX?

--D

>
> struct iomap_ops {
> /*
> --
> 2.30.2
>

2021-11-23 23:01:29

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 23/29] xfs: use IOMAP_DAX to check for DAX mappings

On Tue, Nov 09, 2021 at 09:33:03AM +0100, Christoph Hellwig wrote:
> Use the explicit DAX flag instead of checking the inode flag in the
> iomap code.
>
> Signed-off-by: Christoph Hellwig <[email protected]>

Any particular reason to pass this in as a flag vs. querying the inode?

Doesn't really bother me either way, was just curious.
Reviewed-by: Darrick J. Wong <[email protected]>

--D

> ---
> fs/xfs/xfs_iomap.c | 7 ++++---
> fs/xfs/xfs_iomap.h | 3 ++-
> fs/xfs/xfs_pnfs.c | 2 +-
> 3 files changed, 7 insertions(+), 5 deletions(-)
>
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 604000b6243ec..8cef3b68cba78 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -188,6 +188,7 @@ xfs_iomap_write_direct(
> struct xfs_inode *ip,
> xfs_fileoff_t offset_fsb,
> xfs_fileoff_t count_fsb,
> + unsigned int flags,
> struct xfs_bmbt_irec *imap)
> {
> struct xfs_mount *mp = ip->i_mount;
> @@ -229,7 +230,7 @@ xfs_iomap_write_direct(
> * the reserve block pool for bmbt block allocation if there is no space
> * left but we need to do unwritten extent conversion.
> */
> - if (IS_DAX(VFS_I(ip))) {
> + if (flags & IOMAP_DAX) {
> bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
> if (imap->br_state == XFS_EXT_UNWRITTEN) {
> force = true;
> @@ -620,7 +621,7 @@ imap_needs_alloc(
> imap->br_startblock == DELAYSTARTBLOCK)
> return true;
> /* we convert unwritten extents before copying the data for DAX */
> - if (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN)
> + if ((flags & IOMAP_DAX) && imap->br_state == XFS_EXT_UNWRITTEN)
> return true;
> return false;
> }
> @@ -826,7 +827,7 @@ xfs_direct_write_iomap_begin(
> xfs_iunlock(ip, lockmode);
>
> error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
> - &imap);
> + flags, &imap);
> if (error)
> return error;
>
> diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
> index f1a281ab9328c..5648262a71736 100644
> --- a/fs/xfs/xfs_iomap.h
> +++ b/fs/xfs/xfs_iomap.h
> @@ -12,7 +12,8 @@ struct xfs_inode;
> struct xfs_bmbt_irec;
>
> int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
> - xfs_fileoff_t count_fsb, struct xfs_bmbt_irec *imap);
> + xfs_fileoff_t count_fsb, unsigned int flags,
> + struct xfs_bmbt_irec *imap);
> int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
> xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
> xfs_fileoff_t end_fsb);
> diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
> index 5e1d29d8b2e73..e188e1cf97cc5 100644
> --- a/fs/xfs/xfs_pnfs.c
> +++ b/fs/xfs/xfs_pnfs.c
> @@ -155,7 +155,7 @@ xfs_fs_map_blocks(
> xfs_iunlock(ip, lock_flags);
>
> error = xfs_iomap_write_direct(ip, offset_fsb,
> - end_fsb - offset_fsb, &imap);
> + end_fsb - offset_fsb, 0, &imap);
> if (error)
> goto out_unlock;
>
> --
> 2.30.2
>

2021-11-23 23:02:24

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 24/29] xfs: use xfs_direct_write_iomap_ops for DAX zeroing

On Tue, Nov 09, 2021 at 09:33:04AM +0100, Christoph Hellwig wrote:
> While the buffered write iomap ops do work due to the fact that zeroing
> never allocates blocks, the DAX zeroing should use the direct ops just
> like actual DAX I/O.
>
> Signed-off-by: Christoph Hellwig <[email protected]>

Heh. I've wanted to fix this for a long time, but I like your
surrounding cleanups better than anything I had time to work on. :)

Reviewed-by: Darrick J. Wong <[email protected]>

--D

> ---
> fs/xfs/xfs_iomap.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 8cef3b68cba78..704292c6ce0c7 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -1324,7 +1324,7 @@ xfs_zero_range(
>
> if (IS_DAX(inode))
> return dax_zero_range(inode, pos, len, did_zero,
> - &xfs_buffered_write_iomap_ops);
> + &xfs_direct_write_iomap_ops);
> return iomap_zero_range(inode, pos, len, did_zero,
> &xfs_buffered_write_iomap_ops);
> }
> @@ -1339,7 +1339,7 @@ xfs_truncate_page(
>
> if (IS_DAX(inode))
> return dax_truncate_page(inode, pos, did_zero,
> - &xfs_buffered_write_iomap_ops);
> + &xfs_direct_write_iomap_ops);
> return iomap_truncate_page(inode, pos, did_zero,
> &xfs_buffered_write_iomap_ops);
> }
> --
> 2.30.2
>

2021-11-23 23:11:48

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 26/29] fsdax: shift partition offset handling into the file systems

On Tue, Nov 09, 2021 at 09:33:06AM +0100, Christoph Hellwig wrote:
> Remove the last user of ->bdev in dax.c by requiring the file system to
> pass in an address that already includes the DAX offset. As part of the
> only set ->bdev or ->daxdev when actually required in the ->iomap_begin

As part of the ... ?

"...impending disentanglement of block_device and dax_device"?

Which I assume is why we make filesystems know about partition offsets
now?

> methods.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> fs/dax.c | 6 +-----
> fs/erofs/data.c | 11 ++++++++--
> fs/erofs/internal.h | 1 +
> fs/ext2/inode.c | 8 +++++--
> fs/ext4/inode.c | 16 +++++++++-----
> fs/xfs/libxfs/xfs_bmap.c | 4 ++--
> fs/xfs/xfs_aops.c | 2 +-
> fs/xfs/xfs_iomap.c | 45 +++++++++++++++++++++++++---------------
> fs/xfs/xfs_iomap.h | 5 +++--
> fs/xfs/xfs_pnfs.c | 2 +-
> 10 files changed, 63 insertions(+), 37 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 0bd6cdcbacfc4..2c13c681edf09 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -711,11 +711,7 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
>
> static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
> {
> - phys_addr_t paddr = iomap->addr + (pos & PAGE_MASK) - iomap->offset;
> -
> - if (iomap->bdev)
> - paddr += (get_start_sect(iomap->bdev) << SECTOR_SHIFT);
> - return PHYS_PFN(paddr);
> + return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset);
> }
>
> static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)

<skip to the xfs part, the ext* parts look ok and I didn't look at erofs>

> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index 4dccd4d90622d..74198dd82b035 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -4551,7 +4551,7 @@ xfs_bmapi_convert_delalloc(
> * the extent. Just return the real extent at this offset.
> */
> if (!isnullstartblock(bma.got.br_startblock)) {
> - xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
> + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
> *seq = READ_ONCE(ifp->if_seq);
> goto out_trans_cancel;
> }
> @@ -4598,7 +4598,7 @@ xfs_bmapi_convert_delalloc(
> XFS_STATS_INC(mp, xs_xstrat_quick);
>
> ASSERT(!isnullstartblock(bma.got.br_startblock));
> - xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
> + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
> *seq = READ_ONCE(ifp->if_seq);
>
> if (whichfork == XFS_COW_FORK)
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index c8c15c3c31471..6ac3449a68ba0 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -359,7 +359,7 @@ xfs_map_blocks(
> isnullstartblock(imap.br_startblock))
> goto allocate_blocks;
>
> - xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0);
> + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0);
> trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
> return 0;
> allocate_blocks:
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 704292c6ce0c7..74dbf1fd99d39 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -54,7 +54,8 @@ xfs_bmbt_to_iomap(
> struct xfs_inode *ip,
> struct iomap *iomap,
> struct xfs_bmbt_irec *imap,
> - u16 flags)
> + unsigned int flags,
> + u16 iomap_flags)

The argument names confused me. Does @flags contain IOMAP_$FOO flags,
whereas @iomap_flags contains IOMAP_F_$FOO flags? Can these be changed
to "unsigned int iomap_flags" and "u16 iomap_f_flags" to make the flags
domain more obvious, please?

I'd also take "u16 mapping_flags" for the last parameter.

--D

> {
> struct xfs_mount *mp = ip->i_mount;
> struct xfs_buftarg *target = xfs_inode_buftarg(ip);
> @@ -71,16 +72,22 @@ xfs_bmbt_to_iomap(
> iomap->type = IOMAP_DELALLOC;
> } else {
> iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock));
> + if (flags & IOMAP_DAX)
> + iomap->addr += target->bt_dax_part_off;
> +
> if (imap->br_state == XFS_EXT_UNWRITTEN)
> iomap->type = IOMAP_UNWRITTEN;
> else
> iomap->type = IOMAP_MAPPED;
> +
> }
> iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
> iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
> - iomap->bdev = target->bt_bdev;
> - iomap->dax_dev = target->bt_daxdev;
> - iomap->flags = flags;
> + if (flags & IOMAP_DAX)
> + iomap->dax_dev = target->bt_daxdev;
> + else
> + iomap->bdev = target->bt_bdev;
> + iomap->flags = iomap_flags;
>
> if (xfs_ipincount(ip) &&
> (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
> @@ -801,7 +808,7 @@ xfs_direct_write_iomap_begin(
>
> xfs_iunlock(ip, lockmode);
> trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
> - return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags);
> + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags);
>
> allocate_blocks:
> error = -EAGAIN;
> @@ -832,18 +839,19 @@ xfs_direct_write_iomap_begin(
> return error;
>
> trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
> - return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags | IOMAP_F_NEW);
> + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
> + iomap_flags | IOMAP_F_NEW);
>
> out_found_cow:
> xfs_iunlock(ip, lockmode);
> length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
> trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
> if (imap.br_startblock != HOLESTARTBLOCK) {
> - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0);
> + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
> if (error)
> return error;
> }
> - return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
> + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED);
>
> out_unlock:
> if (lockmode)
> @@ -1053,23 +1061,24 @@ xfs_buffered_write_iomap_begin(
> */
> xfs_iunlock(ip, XFS_ILOCK_EXCL);
> trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
> - return xfs_bmbt_to_iomap(ip, iomap, &imap, IOMAP_F_NEW);
> + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW);
>
> found_imap:
> xfs_iunlock(ip, XFS_ILOCK_EXCL);
> - return xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
> + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
>
> found_cow:
> xfs_iunlock(ip, XFS_ILOCK_EXCL);
> if (imap.br_startoff <= offset_fsb) {
> - error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0);
> + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
> if (error)
> return error;
> - return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
> + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
> + IOMAP_F_SHARED);
> }
>
> xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
> - return xfs_bmbt_to_iomap(ip, iomap, &cmap, 0);
> + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0);
>
> out_unlock:
> xfs_iunlock(ip, XFS_ILOCK_EXCL);
> @@ -1178,7 +1187,8 @@ xfs_read_iomap_begin(
> if (error)
> return error;
> trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
> - return xfs_bmbt_to_iomap(ip, iomap, &imap, shared ? IOMAP_F_SHARED : 0);
> + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
> + shared ? IOMAP_F_SHARED : 0);
> }
>
> const struct iomap_ops xfs_read_iomap_ops = {
> @@ -1237,7 +1247,8 @@ xfs_seek_iomap_begin(
> if (data_fsb < cow_fsb + cmap.br_blockcount)
> end_fsb = min(end_fsb, data_fsb);
> xfs_trim_extent(&cmap, offset_fsb, end_fsb);
> - error = xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
> + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
> + IOMAP_F_SHARED);
> /*
> * This is a COW extent, so we must probe the page cache
> * because there could be dirty page cache being backed
> @@ -1259,7 +1270,7 @@ xfs_seek_iomap_begin(
> imap.br_state = XFS_EXT_NORM;
> done:
> xfs_trim_extent(&imap, offset_fsb, end_fsb);
> - error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
> + error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
> out_unlock:
> xfs_iunlock(ip, lockmode);
> return error;
> @@ -1306,7 +1317,7 @@ xfs_xattr_iomap_begin(
> if (error)
> return error;
> ASSERT(nimaps);
> - return xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
> + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
> }
>
> const struct iomap_ops xfs_xattr_iomap_ops = {
> diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
> index 5648262a71736..fe7a625361d95 100644
> --- a/fs/xfs/xfs_iomap.h
> +++ b/fs/xfs/xfs_iomap.h
> @@ -18,8 +18,9 @@ int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
> xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
> xfs_fileoff_t end_fsb);
>
> -int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
> - struct xfs_bmbt_irec *, u16);
> +int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
> + struct xfs_bmbt_irec *imap, unsigned int flags,
> + u16 iomap_flags);
>
> int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
> bool *did_zero);
> diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
> index e188e1cf97cc5..d6334abbc0b3e 100644
> --- a/fs/xfs/xfs_pnfs.c
> +++ b/fs/xfs/xfs_pnfs.c
> @@ -173,7 +173,7 @@ xfs_fs_map_blocks(
> }
> xfs_iunlock(ip, XFS_IOLOCK_EXCL);
>
> - error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
> + error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0);
> *device_generation = mp->m_generation;
> return error;
> out_unlock:
> --
> 2.30.2
>

2021-11-23 23:13:10

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 27/29] dax: fix up some of the block device related ifdefs

On Tue, Nov 09, 2021 at 09:33:07AM +0100, Christoph Hellwig wrote:
> The DAX device <-> block device association is only enabled if
> CONFIG_BLOCK is enabled. Update dax.h to account for that and use
> the right conditions for the fs_put_dax stub as well.
>
> Signed-off-by: Christoph Hellwig <[email protected]>

Looks ok,
Reviewed-by: Darrick J. Wong <[email protected]>

--D

> ---
> include/linux/dax.h | 41 ++++++++++++++++++++---------------------
> 1 file changed, 20 insertions(+), 21 deletions(-)
>
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index 90f95deff504d..5568d3dca941b 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -108,28 +108,15 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
> #endif
>
> struct writeback_control;
> -#if IS_ENABLED(CONFIG_FS_DAX)
> +#if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
> int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
> void dax_remove_host(struct gendisk *disk);
> -
> +struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
> + u64 *start_off);
> static inline void fs_put_dax(struct dax_device *dax_dev)
> {
> put_dax(dax_dev);
> }
> -
> -struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
> - u64 *start_off);
> -int dax_writeback_mapping_range(struct address_space *mapping,
> - struct dax_device *dax_dev, struct writeback_control *wbc);
> -
> -struct page *dax_layout_busy_page(struct address_space *mapping);
> -struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
> -dax_entry_t dax_lock_page(struct page *page);
> -void dax_unlock_page(struct page *page, dax_entry_t cookie);
> -int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
> - const struct iomap_ops *ops);
> -int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
> - const struct iomap_ops *ops);
> #else
> static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
> {
> @@ -138,17 +125,29 @@ static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
> static inline void dax_remove_host(struct gendisk *disk)
> {
> }
> -
> -static inline void fs_put_dax(struct dax_device *dax_dev)
> -{
> -}
> -
> static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
> u64 *start_off)
> {
> return NULL;
> }
> +static inline void fs_put_dax(struct dax_device *dax_dev)
> +{
> +}
> +#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */
> +
> +#if IS_ENABLED(CONFIG_FS_DAX)
> +int dax_writeback_mapping_range(struct address_space *mapping,
> + struct dax_device *dax_dev, struct writeback_control *wbc);
>
> +struct page *dax_layout_busy_page(struct address_space *mapping);
> +struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
> +dax_entry_t dax_lock_page(struct page *page);
> +void dax_unlock_page(struct page *page, dax_entry_t cookie);
> +int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
> + const struct iomap_ops *ops);
> +int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
> + const struct iomap_ops *ops);
> +#else
> static inline struct page *dax_layout_busy_page(struct address_space *mapping)
> {
> return NULL;
> --
> 2.30.2
>

2021-11-23 23:13:29

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 28/29] iomap: build the block based code conditionally

On Tue, Nov 09, 2021 at 09:33:08AM +0100, Christoph Hellwig wrote:
> Only build the block based iomap code if CONFIG_BLOCK is set. Currently
> that is always the case, but it will change soon.
>
> Signed-off-by: Christoph Hellwig <[email protected]>

Looks ok,
Reviewed-by: Darrick J. Wong <[email protected]>

--D

> ---
> fs/Kconfig | 4 ++--
> fs/iomap/Makefile | 4 ++--
> 2 files changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/fs/Kconfig b/fs/Kconfig
> index a6313a969bc5f..6d608330a096e 100644
> --- a/fs/Kconfig
> +++ b/fs/Kconfig
> @@ -15,11 +15,11 @@ config VALIDATE_FS_PARSER
> Enable this to perform validation of the parameter description for a
> filesystem when it is registered.
>
> -if BLOCK
> -
> config FS_IOMAP
> bool
>
> +if BLOCK
> +
> source "fs/ext2/Kconfig"
> source "fs/ext4/Kconfig"
> source "fs/jbd2/Kconfig"
> diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
> index 4143a3ff89dbc..fc070184b7faa 100644
> --- a/fs/iomap/Makefile
> +++ b/fs/iomap/Makefile
> @@ -9,9 +9,9 @@ ccflags-y += -I $(srctree)/$(src) # needed for trace events
> obj-$(CONFIG_FS_IOMAP) += iomap.o
>
> iomap-y += trace.o \
> - buffered-io.o \
> + iter.o
> +iomap-$(CONFIG_BLOCK) += buffered-io.o \
> direct-io.o \
> fiemap.o \
> - iter.o \
> seek.o
> iomap-$(CONFIG_SWAP) += swapfile.o
> --
> 2.30.2
>

2021-11-23 23:13:46

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 29/29] fsdax: don't require CONFIG_BLOCK

On Tue, Nov 09, 2021 at 09:33:09AM +0100, Christoph Hellwig wrote:
> The file system DAX code now does not require the block code. So allow
> building a kernel with fuse DAX but not block layer.
>
> Signed-off-by: Christoph Hellwig <[email protected]>

Looks good,
Reviewed-by: Darrick J. Wong <[email protected]>

--D

> ---
> fs/Kconfig | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/fs/Kconfig b/fs/Kconfig
> index 6d608330a096e..7a2b11c0b8036 100644
> --- a/fs/Kconfig
> +++ b/fs/Kconfig
> @@ -42,6 +42,8 @@ source "fs/nilfs2/Kconfig"
> source "fs/f2fs/Kconfig"
> source "fs/zonefs/Kconfig"
>
> +endif # BLOCK
> +
> config FS_DAX
> bool "File system based Direct Access (DAX) support"
> depends on MMU
> @@ -89,8 +91,6 @@ config FS_DAX_PMD
> config FS_DAX_LIMITED
> bool
>
> -endif # BLOCK
> -
> # Posix ACL utility routines
> #
> # Note: Posix ACLs can be implemented without these helpers. Never use
> --
> 2.30.2
>

2021-11-24 02:41:01

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 21/29] xfs: move dax device handling into xfs_{alloc,free}_buftarg

On Tue, Nov 9, 2021 at 12:34 AM Christoph Hellwig <[email protected]> wrote:
>
> Hide the DAX device lookup from the xfs_super.c code.
>
> Reviewed-by: Christoph Hellwig <[email protected]>

That's an interesting spelling of "Signed-off-by", but patch looks
good to me too. I would have expected a robot to complain about
missing sign-off?

Reviewed-by: Dan Williams <[email protected]>

2021-11-24 02:47:26

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 22/29] iomap: add a IOMAP_DAX flag

On Tue, Nov 9, 2021 at 12:34 AM Christoph Hellwig <[email protected]> wrote:
>
> Add a flag so that the file system can easily detect DAX operations.

Looks ok, but I would have preferred a quick note about the rationale
here before needing to read other patches to figure that out.

If you add that you can add:

Reviewed-by: Dan Williams <[email protected]>

>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> fs/dax.c | 7 ++++---
> include/linux/iomap.h | 1 +
> 2 files changed, 5 insertions(+), 3 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 5b52b878124ac..0bd6cdcbacfc4 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1180,7 +1180,7 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
> .inode = inode,
> .pos = pos,
> .len = len,
> - .flags = IOMAP_ZERO,
> + .flags = IOMAP_DAX | IOMAP_ZERO,
> };
> int ret;
>
> @@ -1308,6 +1308,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
> .inode = iocb->ki_filp->f_mapping->host,
> .pos = iocb->ki_pos,
> .len = iov_iter_count(iter),
> + .flags = IOMAP_DAX,
> };
> loff_t done = 0;
> int ret;
> @@ -1461,7 +1462,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
> .inode = mapping->host,
> .pos = (loff_t)vmf->pgoff << PAGE_SHIFT,
> .len = PAGE_SIZE,
> - .flags = IOMAP_FAULT,
> + .flags = IOMAP_DAX | IOMAP_FAULT,
> };
> vm_fault_t ret = 0;
> void *entry;
> @@ -1570,7 +1571,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
> struct iomap_iter iter = {
> .inode = mapping->host,
> .len = PMD_SIZE,
> - .flags = IOMAP_FAULT,
> + .flags = IOMAP_DAX | IOMAP_FAULT,
> };
> vm_fault_t ret = VM_FAULT_FALLBACK;
> pgoff_t max_pgoff;
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 6d1b08d0ae930..146a7e3e3ea11 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -141,6 +141,7 @@ struct iomap_page_ops {
> #define IOMAP_NOWAIT (1 << 5) /* do not block */
> #define IOMAP_OVERWRITE_ONLY (1 << 6) /* only pure overwrites allowed */
> #define IOMAP_UNSHARE (1 << 7) /* unshare_file_range */
> +#define IOMAP_DAX (1 << 8) /* DAX mapping */
>
> struct iomap_ops {
> /*
> --
> 2.30.2
>

2021-11-24 02:49:28

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 23/29] xfs: use IOMAP_DAX to check for DAX mappings

On Tue, Nov 9, 2021 at 12:34 AM Christoph Hellwig <[email protected]> wrote:
>
> Use the explicit DAX flag instead of checking the inode flag in the
> iomap code.

It's not immediately clear to me why this is a net benefit. Are you
anticipating inode-less operations? With reflink and multi-inode
operations a single iomap flag seems insufficient, no?

2021-11-24 02:52:27

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 24/29] xfs: use xfs_direct_write_iomap_ops for DAX zeroing

On Tue, Nov 9, 2021 at 12:34 AM Christoph Hellwig <[email protected]> wrote:
>
> While the buffered write iomap ops do work due to the fact that zeroing
> never allocates blocks, the DAX zeroing should use the direct ops just
> like actual DAX I/O.
>

I always wondered about this, change looks good to me.

Reviewed-by: Dan Williams <[email protected]>

> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> fs/xfs/xfs_iomap.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 8cef3b68cba78..704292c6ce0c7 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -1324,7 +1324,7 @@ xfs_zero_range(
>
> if (IS_DAX(inode))
> return dax_zero_range(inode, pos, len, did_zero,
> - &xfs_buffered_write_iomap_ops);
> + &xfs_direct_write_iomap_ops);
> return iomap_zero_range(inode, pos, len, did_zero,
> &xfs_buffered_write_iomap_ops);
> }
> @@ -1339,7 +1339,7 @@ xfs_truncate_page(
>
> if (IS_DAX(inode))
> return dax_truncate_page(inode, pos, did_zero,
> - &xfs_buffered_write_iomap_ops);
> + &xfs_direct_write_iomap_ops);
> return iomap_truncate_page(inode, pos, did_zero,
> &xfs_buffered_write_iomap_ops);
> }
> --
> 2.30.2
>

2021-11-24 02:56:41

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 25/29] dax: return the partition offset from fs_dax_get_by_bdev

On Tue, Nov 9, 2021 at 12:34 AM Christoph Hellwig <[email protected]> wrote:
>
> Prepare from removing the block_device from the DAX I/O path by returning

s/from removing/for the removal of/

> the partition offset from fs_dax_get_by_bdev so that the file systems
> have it at hand for use during I/O.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> drivers/dax/super.c | 9 ++++++---
> drivers/md/dm.c | 4 ++--
> fs/erofs/internal.h | 2 ++
> fs/erofs/super.c | 4 ++--
> fs/ext2/ext2.h | 1 +
> fs/ext2/super.c | 2 +-
> fs/ext4/ext4.h | 1 +
> fs/ext4/super.c | 2 +-
> fs/xfs/xfs_buf.c | 2 +-
> fs/xfs/xfs_buf.h | 1 +
> include/linux/dax.h | 6 ++++--
> 11 files changed, 22 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/dax/super.c b/drivers/dax/super.c
> index c0910687fbcb2..cc32dcf71c116 100644
> --- a/drivers/dax/super.c
> +++ b/drivers/dax/super.c
> @@ -70,17 +70,20 @@ EXPORT_SYMBOL_GPL(dax_remove_host);
> /**
> * dax_get_by_host() - temporary lookup mechanism for filesystem-dax
> * @bdev: block device to find a dax_device for
> + * @start_off: returns the byte offset into the dax_device that @bdev starts
> */
> -struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
> +struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off)
> {
> struct dax_device *dax_dev;
> + u64 part_size;
> int id;
>
> if (!blk_queue_dax(bdev->bd_disk->queue))
> return NULL;
>
> - if ((get_start_sect(bdev) * SECTOR_SIZE) % PAGE_SIZE ||
> - (bdev_nr_sectors(bdev) * SECTOR_SIZE) % PAGE_SIZE) {
> + *start_off = get_start_sect(bdev) * SECTOR_SIZE;
> + part_size = bdev_nr_sectors(bdev) * SECTOR_SIZE;
> + if (*start_off % PAGE_SIZE || part_size % PAGE_SIZE) {
> pr_info("%pg: error: unaligned partition for dax\n", bdev);
> return NULL;
> }
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 282008afc465f..5ea6115d19bdc 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -637,7 +637,7 @@ static int open_table_device(struct table_device *td, dev_t dev,
> struct mapped_device *md)
> {
> struct block_device *bdev;
> -
> + u64 part_off;
> int r;
>
> BUG_ON(td->dm_dev.bdev);
> @@ -653,7 +653,7 @@ static int open_table_device(struct table_device *td, dev_t dev,
> }
>
> td->dm_dev.bdev = bdev;
> - td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev);
> + td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off);

Perhaps allow NULL as an argument for callers that do not care about
the start offset?


Otherwise, looks good / clever.

Reviewed-by: Dan Williams <[email protected]>

2021-11-24 03:05:17

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 21/29] xfs: move dax device handling into xfs_{alloc,free}_buftarg

On Tue, Nov 23, 2021 at 06:40:47PM -0800, Dan Williams wrote:
> On Tue, Nov 9, 2021 at 12:34 AM Christoph Hellwig <[email protected]> wrote:
> >
> > Hide the DAX device lookup from the xfs_super.c code.
> >
> > Reviewed-by: Christoph Hellwig <[email protected]>
>
> That's an interesting spelling of "Signed-off-by", but patch looks
> good to me too. I would have expected a robot to complain about
> missing sign-off?

Nah, they only like to do that /after/ you've pushed a branch to
kernel.org and emailed the lists about it. ;)

--D

> Reviewed-by: Dan Williams <[email protected]>

2021-11-24 03:22:00

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 26/29] fsdax: shift partition offset handling into the file systems

On Tue, Nov 9, 2021 at 12:34 AM Christoph Hellwig <[email protected]> wrote:
>
> Remove the last user of ->bdev in dax.c by requiring the file system to
> pass in an address that already includes the DAX offset. As part of the
> only set ->bdev or ->daxdev when actually required in the ->iomap_begin
> methods.

Changes look good except for what looks like an argument position
fixup needed for an xfs_bmbt_to_iomap() caller below...

>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> fs/dax.c | 6 +-----
> fs/erofs/data.c | 11 ++++++++--
> fs/erofs/internal.h | 1 +
> fs/ext2/inode.c | 8 +++++--
> fs/ext4/inode.c | 16 +++++++++-----
> fs/xfs/libxfs/xfs_bmap.c | 4 ++--
> fs/xfs/xfs_aops.c | 2 +-
> fs/xfs/xfs_iomap.c | 45 +++++++++++++++++++++++++---------------
> fs/xfs/xfs_iomap.h | 5 +++--
> fs/xfs/xfs_pnfs.c | 2 +-
> 10 files changed, 63 insertions(+), 37 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 0bd6cdcbacfc4..2c13c681edf09 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -711,11 +711,7 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
>
> static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
> {
> - phys_addr_t paddr = iomap->addr + (pos & PAGE_MASK) - iomap->offset;
> -
> - if (iomap->bdev)
> - paddr += (get_start_sect(iomap->bdev) << SECTOR_SHIFT);
> - return PHYS_PFN(paddr);
> + return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset);
> }
>
> static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
> diff --git a/fs/erofs/data.c b/fs/erofs/data.c
> index 0e35ef3f9f3d7..9b1bb177ce303 100644
> --- a/fs/erofs/data.c
> +++ b/fs/erofs/data.c
[..]
}
> @@ -215,9 +218,13 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
> if (ret)
> return ret;
>
> - iomap->bdev = mdev.m_bdev;
> - iomap->dax_dev = mdev.m_daxdev;
> iomap->offset = map.m_la;
> + if (flags & IOMAP_DAX) {
> + iomap->dax_dev = mdev.m_daxdev;
> + iomap->offset += mdev.m_dax_part_off;
> + } else {
> + iomap->bdev = mdev.m_bdev;
> + }

Ah, that's what IOMAP_DAX is for, to stop making iomap carry bdev
details unnecessarily.

[..]
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 704292c6ce0c7..74dbf1fd99d39 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -54,7 +54,8 @@ xfs_bmbt_to_iomap(
> struct xfs_inode *ip,
> struct iomap *iomap,
> struct xfs_bmbt_irec *imap,
> - u16 flags)
> + unsigned int flags,
> + u16 iomap_flags)

It would be nice if the compiler could help with making sure that the
right 'flags' values are passed to the right 'flags' parameter, but I
can't think of a good way to do that.

[..]
> @@ -1053,23 +1061,24 @@ xfs_buffered_write_iomap_begin(
> */
> xfs_iunlock(ip, XFS_ILOCK_EXCL);
> trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
> - return xfs_bmbt_to_iomap(ip, iomap, &imap, IOMAP_F_NEW);
> + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW);
>
> found_imap:
> xfs_iunlock(ip, XFS_ILOCK_EXCL);
> - return xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
> + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);

The iomap flags are supposed to be the last argument, right?

2021-11-24 03:44:21

by Gao Xiang

[permalink] [raw]
Subject: Re: [PATCH 26/29] fsdax: shift partition offset handling into the file systems

On Tue, Nov 09, 2021 at 09:33:06AM +0100, Christoph Hellwig wrote:
> Remove the last user of ->bdev in dax.c by requiring the file system to
> pass in an address that already includes the DAX offset. As part of the
> only set ->bdev or ->daxdev when actually required in the ->iomap_begin
> methods.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> fs/dax.c | 6 +-----
> fs/erofs/data.c | 11 ++++++++--
> fs/erofs/internal.h | 1 +

For erofs part, it looks good to me,
Reviewed-by: Gao Xiang <[email protected]>

Thanks,
Gao Xiang

2021-11-24 03:48:06

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 27/29] dax: fix up some of the block device related ifdefs

On Tue, Nov 9, 2021 at 12:34 AM Christoph Hellwig <[email protected]> wrote:
>
> The DAX device <-> block device association is only enabled if
> CONFIG_BLOCK is enabled. Update dax.h to account for that and use
> the right conditions for the fs_put_dax stub as well.

Looks good to me.

Reviewed-by: Dan Williams <[email protected]>

>
> Signed-off-by: Christoph Hellwig <[email protected]>

2021-11-24 03:51:26

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 28/29] iomap: build the block based code conditionally

On Tue, Nov 9, 2021 at 12:34 AM Christoph Hellwig <[email protected]> wrote:
>
> Only build the block based iomap code if CONFIG_BLOCK is set. Currently
> that is always the case, but it will change soon.

Looks good.

Reviewed-by: Dan Williams <[email protected]>

2021-11-24 03:52:24

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 29/29] fsdax: don't require CONFIG_BLOCK

On Tue, Nov 9, 2021 at 12:34 AM Christoph Hellwig <[email protected]> wrote:
>
> The file system DAX code now does not require the block code. So allow
> building a kernel with fuse DAX but not block layer.

Looks good to me.

Reviewed-by: Dan Williams <[email protected]>

2021-11-24 06:36:11

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 06/29] dax: move the partition alignment check into fs_dax_get_by_bdev

On Tue, Nov 23, 2021 at 02:25:55PM -0800, Darrick J. Wong wrote:
> > + if ((get_start_sect(bdev) * SECTOR_SIZE) % PAGE_SIZE ||
> > + (bdev_nr_sectors(bdev) * SECTOR_SIZE) % PAGE_SIZE) {
>
> Do we have to be careful about 64-bit division here, or do we not
> support DAX on 32-bit?

I can't find anything in the Kconfig limiting DAX to 64-bit. But
then again the existing code has divisions like this, so the compiler
is probably smart enough to turn them into shifts.

> > + pr_info("%pg: error: unaligned partition for dax\n", bdev);
>
> I also wonder if this should be ratelimited...?

This happens once (or maybe three times for XFS with rt and log devices)
at mount time, so I see no need for a ratelimit.

2021-11-24 06:37:39

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 08/29] dax: remove dax_capable

On Tue, Nov 23, 2021 at 02:31:23PM -0800, Darrick J. Wong wrote:
> > - struct super_block *sb = mp->m_super;
> > -
> > - if (!xfs_buftarg_is_dax(sb, mp->m_ddev_targp) &&
> > - (!mp->m_rtdev_targp || !xfs_buftarg_is_dax(sb, mp->m_rtdev_targp))) {
> > + if (!mp->m_ddev_targp->bt_daxdev &&
> > + (!mp->m_rtdev_targp || !mp->m_rtdev_targp->bt_daxdev)) {
>
> Nit: This ^ paren should be indented one more column because it's a
> sub-clause of the if() test.

Done.

> Nit: xfs_alert() already adds a newline to the end of the format string.

Already done in the current tree.

2021-11-24 06:39:40

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 14/29] fsdax: simplify the pgoff calculation

On Tue, Nov 23, 2021 at 02:36:42PM -0800, Darrick J. Wong wrote:
> > - phys_addr_t phys_off = (start_sect + sector) * 512;
> > -
> > - if (pgoff)
> > - *pgoff = PHYS_PFN(phys_off);
> > - if (phys_off % PAGE_SIZE || size % PAGE_SIZE)
>
> AFAICT, we're relying on fs_dax_get_by_bdev to have validated this
> previously, which is why the error return stuff goes away?

Exactly.

2021-11-24 06:47:51

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 17/29] fsdax: factor out a dax_memzero helper

On Tue, Nov 23, 2021 at 01:22:13PM -0800, Dan Williams wrote:
> On Tue, Nov 9, 2021 at 12:34 AM Christoph Hellwig <[email protected]> wrote:
> >
> > Factor out a helper for the "manual" zeroing of a DAX range to clean
> > up dax_iomap_zero a lot.
> >
>
> Small / optional fixup below:

Incorporated.

2021-11-24 06:50:56

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 18/29] fsdax: decouple zeroing from the iomap buffered I/O code

On Tue, Nov 23, 2021 at 01:46:35PM -0800, Dan Williams wrote:
> > + const struct iomap_ops *ops)
> > +{
> > + unsigned int blocksize = i_blocksize(inode);
> > + unsigned int off = pos & (blocksize - 1);
> > +
> > + /* Block boundary? Nothing to do */
> > + if (!off)
> > + return 0;
>
> It took me a moment to figure out why this was correct. I see it was
> also copied from iomap_truncate_page(). It makes sense for DAX where
> blocksize >= PAGE_SIZE so it's always the case that the amount of
> capacity to zero relative to a page is from @pos to the end of the
> block. Is there something else that protects the blocksize < PAGE_SIZE
> case outside of DAX?
>
> Nothing to change for this patch, just a question I had while reviewing.

This is a helper for truncate ->setattr, where everything outside the
block is deallocated. So zeroing is only needed inside the block.

2021-11-24 06:52:33

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 18/29] fsdax: decouple zeroing from the iomap buffered I/O code

On Tue, Nov 23, 2021 at 02:53:15PM -0800, Darrick J. Wong wrote:
> > -s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap)
> > +static loff_t dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
>
> Shouldn't this return value remain s64 to match iomap_iter.processed?

I'll switch it over. Given that loff_t is always the same as s64
it shouldn't really matter.

(same for the others)

2021-11-24 06:53:57

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 20/29] ext4: cleanup the dax handling in ext4_fill_super

On Tue, Nov 23, 2021 at 02:54:30PM -0800, Darrick J. Wong wrote:
> Nit: no space before the paren ^ here.

Fixed.

2021-11-24 06:55:08

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 21/29] xfs: move dax device handling into xfs_{alloc,free}_buftarg

On Tue, Nov 23, 2021 at 06:40:47PM -0800, Dan Williams wrote:
> On Tue, Nov 9, 2021 at 12:34 AM Christoph Hellwig <[email protected]> wrote:
> >
> > Hide the DAX device lookup from the xfs_super.c code.
> >
> > Reviewed-by: Christoph Hellwig <[email protected]>
>
> That's an interesting spelling of "Signed-off-by", but patch looks
> good to me too. I would have expected a robot to complain about
> missing sign-off?

Hah. I'll fix it up.

2021-11-24 06:59:42

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 25/29] dax: return the partition offset from fs_dax_get_by_bdev

On Tue, Nov 23, 2021 at 06:56:29PM -0800, Dan Williams wrote:
> On Tue, Nov 9, 2021 at 12:34 AM Christoph Hellwig <[email protected]> wrote:
> >
> > Prepare from removing the block_device from the DAX I/O path by returning
>
> s/from removing/for the removal of/

Fixed.

> > td->dm_dev.bdev = bdev;
> > - td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev);
> > + td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off);
>
> Perhaps allow NULL as an argument for callers that do not care about
> the start offset?

All callers currently care, dm just has another way to get at the
information. So for now I'd like to not add the NULL special case,
but we can reconsider that as needed if/when more callers show up.

2021-11-24 07:10:35

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 22/29] iomap: add a IOMAP_DAX flag

On Tue, Nov 23, 2021 at 06:47:10PM -0800, Dan Williams wrote:
> On Tue, Nov 9, 2021 at 12:34 AM Christoph Hellwig <[email protected]> wrote:
> >
> > Add a flag so that the file system can easily detect DAX operations.
>
> Looks ok, but I would have preferred a quick note about the rationale
> here before needing to read other patches to figure that out.

The reason is to only apply the DAX partition offsets to actual DAX
operations, and not to e.g. fiemap. I'll document that more clearly.

2021-11-24 07:14:35

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 23/29] xfs: use IOMAP_DAX to check for DAX mappings

On Tue, Nov 23, 2021 at 03:01:24PM -0800, Darrick J. Wong wrote:
> On Tue, Nov 09, 2021 at 09:33:03AM +0100, Christoph Hellwig wrote:
> > Use the explicit DAX flag instead of checking the inode flag in the
> > iomap code.
> >
> > Signed-off-by: Christoph Hellwig <[email protected]>
>
> Any particular reason to pass this in as a flag vs. querying the inode?

Same reason as the addition of IOMAP_DAX. But I think I'll redo this
a bit to do the XFS parameter passing first and then actually check
IOMAP_DAX together with introducing it to make it all a little more clear.