2023-08-02 16:38:35

by Christoph Hellwig

[permalink] [raw]
Subject: more blkdev_get and holder work

Hi all,

this series sits on top of the vfs.super branch in the VFS tree and does a
few closely related things:

1) it also converts nilfs2 and btrfs to the new scheme where the file system
only opens the block devices after we know that a new super_block was
allocated.
2) it then makes sure that for all file system openers the super_block is
stored in bd_holder, and makes use of that fact in the mark_dead method
so that it doesn't have to fall get_super and thus can also work on
block devices that sb->s_bdev doesn't point to
3) it then drops the fs-specific holder ops in ext4 and xfs and uses the
generic fs_holder_ops there

A git tree is available here:

git://git.infradead.org/users/hch/misc.git fs-holder-rework

Gitweb:

http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/fs-holder-rework

Diffstat:
fs/btrfs/super.c | 67 ++++++++++++++++---------------------
fs/btrfs/volumes.c | 8 ++--
fs/btrfs/volumes.h | 2 -
fs/ext4/super.c | 18 +++-------
fs/f2fs/super.c | 7 +--
fs/nilfs2/super.c | 81 ++++++++++++++++-----------------------------
fs/super.c | 44 ++++++++++++++++++------
fs/xfs/xfs_super.c | 32 +++++++----------
include/linux/blkdev.h | 2 +
include/linux/fs_context.h | 2 +
10 files changed, 126 insertions(+), 137 deletions(-)


2023-08-02 16:43:05

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 06/12] fs: use the super_block as holder when mounting file systems

The file system type is not a very useful holder as it doesn't allow us
to go back to the actual file system instance. Pass the super_block instead
which is useful when passed back to the file system driver.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/btrfs/super.c | 7 ++-----
fs/f2fs/super.c | 7 +++----
fs/super.c | 8 ++++----
3 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 5980b5dcc6b163..8a47c7f2690880 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -69,8 +69,6 @@ static const struct super_operations btrfs_super_ops;
* requested by subvol=/path. That way the callchain is straightforward and we
* don't have to play tricks with the mount options and recursive calls to
* btrfs_mount.
- *
- * The new btrfs_root_fs_type also servers as a tag for the bdev_holder.
*/
static struct file_system_type btrfs_fs_type;
static struct file_system_type btrfs_root_fs_type;
@@ -1498,8 +1496,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
} else {
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

- error = btrfs_open_devices(fs_devices, sb_open_mode(flags),
- fs_type);
+ error = btrfs_open_devices(fs_devices, sb_open_mode(flags), s);
if (error)
goto out_deactivate;

@@ -1513,7 +1510,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
fs_devices->latest_dev->bdev);
shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name,
s->s_id);
- btrfs_sb(s)->bdev_holder = fs_type;
+ fs_info->bdev_holder = s;
error = btrfs_fill_super(s, fs_devices, data);
}
if (!error)
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index ca31163da00a55..05c90fdb7a6cca 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1561,7 +1561,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi)
int i;

for (i = 0; i < sbi->s_ndevs; i++) {
- blkdev_put(FDEV(i).bdev, sbi->sb->s_type);
+ blkdev_put(FDEV(i).bdev, sbi->sb);
#ifdef CONFIG_BLK_DEV_ZONED
kvfree(FDEV(i).blkz_seq);
#endif
@@ -4198,7 +4198,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
/* Single zoned block device mount */
FDEV(0).bdev =
blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev, mode,
- sbi->sb->s_type, NULL);
+ sbi->sb, NULL);
} else {
/* Multi-device mount */
memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN);
@@ -4217,8 +4217,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
sbi->log_blocks_per_seg) - 1;
}
FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, mode,
- sbi->sb->s_type,
- NULL);
+ sbi->sb, NULL);
}
if (IS_ERR(FDEV(i).bdev))
return PTR_ERR(FDEV(i).bdev);
diff --git a/fs/super.c b/fs/super.c
index 6aaa275fa8630d..09b65ee1a8b737 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1249,7 +1249,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
blk_mode_t mode = sb_open_mode(sb_flags);
struct block_device *bdev;

- bdev = blkdev_get_by_dev(sb->s_dev, mode, sb->s_type, &fs_holder_ops);
+ bdev = blkdev_get_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
if (IS_ERR(bdev)) {
if (fc)
errorf(fc, "%s: Can't open blockdev", fc->source);
@@ -1262,7 +1262,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
* writable from userspace even for a read-only block device.
*/
if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
- blkdev_put(bdev, sb->s_type);
+ blkdev_put(bdev, sb);
return -EACCES;
}

@@ -1278,7 +1278,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
mutex_unlock(&bdev->bd_fsfreeze_mutex);
if (fc)
warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
- blkdev_put(bdev, sb->s_type);
+ blkdev_put(bdev, sb);
return -EBUSY;
}
spin_lock(&sb_lock);
@@ -1418,7 +1418,7 @@ void kill_block_super(struct super_block *sb)
if (bdev) {
bdev->bd_super = NULL;
sync_blockdev(bdev);
- blkdev_put(bdev, sb->s_type);
+ blkdev_put(bdev, sb);
}
}

--
2.39.2


2023-08-02 16:43:10

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 08/12] fs: export fs_holder_ops

Export fs_holder_ops so that file systems that open additional block
devices can use it as well.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/super.c | 3 ++-
include/linux/blkdev.h | 2 ++
2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/super.c b/fs/super.c
index 0cda4af0a7e16c..dac05f96ab9ac8 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1244,9 +1244,10 @@ static void fs_mark_dead(struct block_device *bdev)
up_read(&sb->s_umount);
}

-static const struct blk_holder_ops fs_holder_ops = {
+const struct blk_holder_ops fs_holder_ops = {
.mark_dead = fs_mark_dead,
};
+EXPORT_SYMBOL_GPL(fs_holder_ops);

static int set_bdev_super(struct super_block *s, void *data)
{
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ed44a997f629f5..83262702eea71a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1464,6 +1464,8 @@ struct blk_holder_ops {
void (*mark_dead)(struct block_device *bdev);
};

+extern const struct blk_holder_ops fs_holder_ops;
+
/*
* Return the correct open flags for blkdev_get_by_* for super block flags
* as stored in sb->s_flags.
--
2.39.2


2023-08-02 16:50:14

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 12/12] xfs use fs_holder_ops for the log and RT devices

Use the generic fs_holder_ops to shut down the file system when the
log or RT device goes away instead of duplicating the logic.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/xfs/xfs_super.c | 17 +++--------------
1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d5042419ed9997..338eba71ff8667 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -377,17 +377,6 @@ xfs_setup_dax_always(
return 0;
}

-static void
-xfs_bdev_mark_dead(
- struct block_device *bdev)
-{
- xfs_force_shutdown(bdev->bd_holder, SHUTDOWN_DEVICE_REMOVED);
-}
-
-static const struct blk_holder_ops xfs_holder_ops = {
- .mark_dead = xfs_bdev_mark_dead,
-};
-
STATIC int
xfs_blkdev_get(
xfs_mount_t *mp,
@@ -396,8 +385,8 @@ xfs_blkdev_get(
{
int error = 0;

- *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, mp,
- &xfs_holder_ops);
+ *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ mp->m_super, &fs_holder_ops);
if (IS_ERR(*bdevp)) {
error = PTR_ERR(*bdevp);
xfs_warn(mp, "Invalid device [%s], error=%d", name, error);
@@ -412,7 +401,7 @@ xfs_blkdev_put(
struct block_device *bdev)
{
if (bdev)
- blkdev_put(bdev, mp);
+ blkdev_put(bdev, mp->m_super);
}

STATIC void
--
2.39.2


2023-08-02 16:51:04

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 04/12] btrfs: open block devices after superblock creation

Currently btrfs_mount_root opens the block devices before committing to
allocating a super block. That creates problems for restricting the
number of writers to a device, and also leads to a unusual and not very
helpful holder (the fs_type).

Reorganize the code to first look whether the superblock for a
particular fsid does already exist and open the block devices only if it
doesn't, mirror the recent changes to the VFS mount helpers.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/btrfs/super.c | 51 ++++++++++++++++++++++------------------------
fs/btrfs/volumes.c | 4 ++--
2 files changed, 26 insertions(+), 29 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b318bddefd5236..5980b5dcc6b163 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1434,10 +1434,8 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
int flags, const char *device_name, void *data)
{
- struct block_device *bdev = NULL;
struct super_block *s;
struct btrfs_device *device = NULL;
- struct btrfs_fs_devices *fs_devices = NULL;
struct btrfs_fs_info *fs_info = NULL;
void *new_sec_opts = NULL;
int error = 0;
@@ -1483,35 +1481,36 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
error = PTR_ERR(device);
goto error_fs_info;
}
-
- fs_devices = device->fs_devices;
- fs_info->fs_devices = fs_devices;
-
- error = btrfs_open_devices(fs_devices, sb_open_mode(flags), fs_type);
+ fs_info->fs_devices = device->fs_devices;
mutex_unlock(&uuid_mutex);
- if (error)
- goto error_fs_info;
-
- if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
- error = -EACCES;
- goto error_close_devices;
- }

- bdev = fs_devices->latest_dev->bdev;
s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC,
fs_info);
if (IS_ERR(s)) {
error = PTR_ERR(s);
- goto error_close_devices;
+ goto error_fs_info;
}

if (s->s_root) {
- btrfs_close_devices(fs_devices);
btrfs_free_fs_info(fs_info);
if ((flags ^ s->s_flags) & SB_RDONLY)
error = -EBUSY;
} else {
- snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+
+ error = btrfs_open_devices(fs_devices, sb_open_mode(flags),
+ fs_type);
+ if (error)
+ goto out_deactivate;
+
+ if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
+ error = -EACCES;
+ btrfs_close_devices(fs_info->fs_devices);
+ goto out_deactivate;
+ }
+
+ snprintf(s->s_id, sizeof(s->s_id), "%pg",
+ fs_devices->latest_dev->bdev);
shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name,
s->s_id);
btrfs_sb(s)->bdev_holder = fs_type;
@@ -1519,21 +1518,19 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
}
if (!error)
error = security_sb_set_mnt_opts(s, new_sec_opts, 0, NULL);
+ if (error)
+ goto out_deactivate;
security_free_mnt_opts(&new_sec_opts);
- if (error) {
- deactivate_locked_super(s);
- return ERR_PTR(error);
- }
-
return dget(s->s_root);

-error_close_devices:
- btrfs_close_devices(fs_devices);
-error_fs_info:
- btrfs_free_fs_info(fs_info);
+out_deactivate:
+ deactivate_locked_super(s);
error_sec_opts:
security_free_mnt_opts(&new_sec_opts);
return ERR_PTR(error);
+error_fs_info:
+ btrfs_free_fs_info(fs_info);
+ goto error_sec_opts;
}

/*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8246578c70f55b..88e9daae5e74ed 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1269,7 +1269,6 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
{
int ret;

- lockdep_assert_held(&uuid_mutex);
/*
* The device_list_mutex cannot be taken here in case opening the
* underlying device takes further locks like open_mutex.
@@ -1277,7 +1276,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
* We also don't need the lock here as this is called during mount and
* exclusion is provided by uuid_mutex
*/
-
+ mutex_lock(&uuid_mutex);
if (fs_devices->opened) {
fs_devices->opened++;
ret = 0;
@@ -1285,6 +1284,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
list_sort(NULL, &fs_devices->devices, devid_cmp);
ret = open_fs_devices(fs_devices, flags, holder);
}
+ mutex_unlock(&uuid_mutex);

return ret;
}
--
2.39.2


2023-08-02 16:51:54

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 02/12] nilfs2: use setup_bdev_super to de-duplicate the mount code

Use the generic setup_bdev_super helper to open the main block device
and do various bits of superblock setup instead of duplicating the
logic. This includes moving to the new scheme implemented in common
code that only opens the block device after the superblock has allocated.

It does not yet convert nilfs2 to the new mount API, but doing so will
become a bit simpler after this first step.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/nilfs2/super.c | 81 ++++++++++++++++++-----------------------------
1 file changed, 30 insertions(+), 51 deletions(-)

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 0ef8c71bde8e5f..a5d1fa4e7552f6 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -35,6 +35,7 @@
#include <linux/writeback.h>
#include <linux/seq_file.h>
#include <linux/mount.h>
+#include <linux/fs_context.h>
#include "nilfs.h"
#include "export.h"
#include "mdt.h"
@@ -1216,7 +1217,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
}

struct nilfs_super_data {
- struct block_device *bdev;
__u64 cno;
int flags;
};
@@ -1283,64 +1283,49 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)

static int nilfs_set_bdev_super(struct super_block *s, void *data)
{
- s->s_bdev = data;
- s->s_dev = s->s_bdev->bd_dev;
+ s->s_dev = *(dev_t *)data;
return 0;
}

static int nilfs_test_bdev_super(struct super_block *s, void *data)
{
- return (void *)s->s_bdev == data;
+ return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data;
}

static struct dentry *
nilfs_mount(struct file_system_type *fs_type, int flags,
const char *dev_name, void *data)
{
- struct nilfs_super_data sd;
+ struct nilfs_super_data sd = { .flags = flags };
struct super_block *s;
- struct dentry *root_dentry;
- int err, s_new = false;
+ dev_t dev;
+ int err;

- sd.bdev = blkdev_get_by_path(dev_name, sb_open_mode(flags), fs_type,
- NULL);
- if (IS_ERR(sd.bdev))
- return ERR_CAST(sd.bdev);
+ if (nilfs_identify(data, &sd))
+ return ERR_PTR(-EINVAL);

- sd.cno = 0;
- sd.flags = flags;
- if (nilfs_identify((char *)data, &sd)) {
- err = -EINVAL;
- goto failed;
- }
+ err = lookup_bdev(dev_name, &dev);
+ if (err)
+ return ERR_PTR(err);

- /*
- * once the super is inserted into the list by sget, s_umount
- * will protect the lockfs code from trying to start a snapshot
- * while we are mounting
- */
- mutex_lock(&sd.bdev->bd_fsfreeze_mutex);
- if (sd.bdev->bd_fsfreeze_count > 0) {
- mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
- err = -EBUSY;
- goto failed;
- }
s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, flags,
- sd.bdev);
- mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
- if (IS_ERR(s)) {
- err = PTR_ERR(s);
- goto failed;
- }
+ &dev);
+ if (IS_ERR(s))
+ return ERR_CAST(s);

if (!s->s_root) {
- s_new = true;
-
- /* New superblock instance created */
- snprintf(s->s_id, sizeof(s->s_id), "%pg", sd.bdev);
- sb_set_blocksize(s, block_size(sd.bdev));
-
- err = nilfs_fill_super(s, data, flags & SB_SILENT ? 1 : 0);
+ /*
+ * We drop s_umount here because we need to open the bdev and
+ * bdev->open_mutex ranks above s_umount (blkdev_put() ->
+ * __invalidate_device()). It is safe because we have active sb
+ * reference and SB_BORN is not set yet.
+ */
+ up_write(&s->s_umount);
+ err = setup_bdev_super(s, flags, NULL);
+ down_write(&s->s_umount);
+ if (!err)
+ err = nilfs_fill_super(s, data,
+ flags & SB_SILENT ? 1 : 0);
if (err)
goto failed_super;

@@ -1366,24 +1351,18 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
}

if (sd.cno) {
+ struct dentry *root_dentry;
+
err = nilfs_attach_snapshot(s, sd.cno, &root_dentry);
if (err)
goto failed_super;
- } else {
- root_dentry = dget(s->s_root);
+ return root_dentry;
}

- if (!s_new)
- blkdev_put(sd.bdev, fs_type);
-
- return root_dentry;
+ return dget(s->s_root);

failed_super:
deactivate_locked_super(s);
-
- failed:
- if (!s_new)
- blkdev_put(sd.bdev, fs_type);
return ERR_PTR(err);
}

--
2.39.2


2023-08-02 17:00:24

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 12/12] xfs use fs_holder_ops for the log and RT devices

On Wed, Aug 02, 2023 at 05:41:31PM +0200, Christoph Hellwig wrote:
> Use the generic fs_holder_ops to shut down the file system when the
> log or RT device goes away instead of duplicating the logic.
>
> Signed-off-by: Christoph Hellwig <[email protected]>

Nice cleanup,
Reviewed-by: Darrick J. Wong <[email protected]>

--D

> ---
> fs/xfs/xfs_super.c | 17 +++--------------
> 1 file changed, 3 insertions(+), 14 deletions(-)
>
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index d5042419ed9997..338eba71ff8667 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -377,17 +377,6 @@ xfs_setup_dax_always(
> return 0;
> }
>
> -static void
> -xfs_bdev_mark_dead(
> - struct block_device *bdev)
> -{
> - xfs_force_shutdown(bdev->bd_holder, SHUTDOWN_DEVICE_REMOVED);
> -}
> -
> -static const struct blk_holder_ops xfs_holder_ops = {
> - .mark_dead = xfs_bdev_mark_dead,
> -};
> -
> STATIC int
> xfs_blkdev_get(
> xfs_mount_t *mp,
> @@ -396,8 +385,8 @@ xfs_blkdev_get(
> {
> int error = 0;
>
> - *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, mp,
> - &xfs_holder_ops);
> + *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE,
> + mp->m_super, &fs_holder_ops);
> if (IS_ERR(*bdevp)) {
> error = PTR_ERR(*bdevp);
> xfs_warn(mp, "Invalid device [%s], error=%d", name, error);
> @@ -412,7 +401,7 @@ xfs_blkdev_put(
> struct block_device *bdev)
> {
> if (bdev)
> - blkdev_put(bdev, mp);
> + blkdev_put(bdev, mp->m_super);
> }
>
> STATIC void
> --
> 2.39.2
>

2023-08-02 17:28:28

by Christoph Hellwig

[permalink] [raw]
Subject: [PATCH 09/12] ext4: drop s_umount over opening the log device

Just like get_tree_bdev needs to drop s_umount when opening the main
device, we need to do the same for the ext4 log device to avoid a
potential lock order reversal with s_unmount for the mark_dead path.

It might be preferable to just drop s_umount over ->fill_super entirely,
but that will require a fairly massive audit first, so we'll do the easy
version here first.

Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/ext4/super.c | 3 +++
1 file changed, 3 insertions(+)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 193d665813b611..2ccb19d345c6dd 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5854,7 +5854,10 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
return NULL;

+ /* see get_tree_bdev why this is needed and safe */
+ up_write(&sb->s_umount);
bdev = ext4_blkdev_get(j_dev, sb);
+ down_write(&sb->s_umount);
if (bdev == NULL)
return NULL;

--
2.39.2


2023-08-03 12:18:27

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH 06/12] fs: use the super_block as holder when mounting file systems

On Wed 02-08-23 17:41:25, Christoph Hellwig wrote:
> The file system type is not a very useful holder as it doesn't allow us
> to go back to the actual file system instance. Pass the super_block instead
> which is useful when passed back to the file system driver.
>
> Signed-off-by: Christoph Hellwig <[email protected]>

Nice, this is what I also wanted to eventually do :). Feel free to add:

Reviewed-by: Jan Kara <[email protected]>

Honza

> ---
> fs/btrfs/super.c | 7 ++-----
> fs/f2fs/super.c | 7 +++----
> fs/super.c | 8 ++++----
> 3 files changed, 9 insertions(+), 13 deletions(-)
>
> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
> index 5980b5dcc6b163..8a47c7f2690880 100644
> --- a/fs/btrfs/super.c
> +++ b/fs/btrfs/super.c
> @@ -69,8 +69,6 @@ static const struct super_operations btrfs_super_ops;
> * requested by subvol=/path. That way the callchain is straightforward and we
> * don't have to play tricks with the mount options and recursive calls to
> * btrfs_mount.
> - *
> - * The new btrfs_root_fs_type also servers as a tag for the bdev_holder.
> */
> static struct file_system_type btrfs_fs_type;
> static struct file_system_type btrfs_root_fs_type;
> @@ -1498,8 +1496,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
> } else {
> struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
>
> - error = btrfs_open_devices(fs_devices, sb_open_mode(flags),
> - fs_type);
> + error = btrfs_open_devices(fs_devices, sb_open_mode(flags), s);
> if (error)
> goto out_deactivate;
>
> @@ -1513,7 +1510,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
> fs_devices->latest_dev->bdev);
> shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name,
> s->s_id);
> - btrfs_sb(s)->bdev_holder = fs_type;
> + fs_info->bdev_holder = s;
> error = btrfs_fill_super(s, fs_devices, data);
> }
> if (!error)
> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
> index ca31163da00a55..05c90fdb7a6cca 100644
> --- a/fs/f2fs/super.c
> +++ b/fs/f2fs/super.c
> @@ -1561,7 +1561,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi)
> int i;
>
> for (i = 0; i < sbi->s_ndevs; i++) {
> - blkdev_put(FDEV(i).bdev, sbi->sb->s_type);
> + blkdev_put(FDEV(i).bdev, sbi->sb);
> #ifdef CONFIG_BLK_DEV_ZONED
> kvfree(FDEV(i).blkz_seq);
> #endif
> @@ -4198,7 +4198,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
> /* Single zoned block device mount */
> FDEV(0).bdev =
> blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev, mode,
> - sbi->sb->s_type, NULL);
> + sbi->sb, NULL);
> } else {
> /* Multi-device mount */
> memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN);
> @@ -4217,8 +4217,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
> sbi->log_blocks_per_seg) - 1;
> }
> FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, mode,
> - sbi->sb->s_type,
> - NULL);
> + sbi->sb, NULL);
> }
> if (IS_ERR(FDEV(i).bdev))
> return PTR_ERR(FDEV(i).bdev);
> diff --git a/fs/super.c b/fs/super.c
> index 6aaa275fa8630d..09b65ee1a8b737 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -1249,7 +1249,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
> blk_mode_t mode = sb_open_mode(sb_flags);
> struct block_device *bdev;
>
> - bdev = blkdev_get_by_dev(sb->s_dev, mode, sb->s_type, &fs_holder_ops);
> + bdev = blkdev_get_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
> if (IS_ERR(bdev)) {
> if (fc)
> errorf(fc, "%s: Can't open blockdev", fc->source);
> @@ -1262,7 +1262,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
> * writable from userspace even for a read-only block device.
> */
> if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
> - blkdev_put(bdev, sb->s_type);
> + blkdev_put(bdev, sb);
> return -EACCES;
> }
>
> @@ -1278,7 +1278,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
> mutex_unlock(&bdev->bd_fsfreeze_mutex);
> if (fc)
> warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
> - blkdev_put(bdev, sb->s_type);
> + blkdev_put(bdev, sb);
> return -EBUSY;
> }
> spin_lock(&sb_lock);
> @@ -1418,7 +1418,7 @@ void kill_block_super(struct super_block *sb)
> if (bdev) {
> bdev->bd_super = NULL;
> sync_blockdev(bdev);
> - blkdev_put(bdev, sb->s_type);
> + blkdev_put(bdev, sb);
> }
> }
>
> --
> 2.39.2
>
--
Jan Kara <[email protected]>
SUSE Labs, CR

2023-08-03 13:04:17

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH 02/12] nilfs2: use setup_bdev_super to de-duplicate the mount code

On Wed 02-08-23 17:41:21, Christoph Hellwig wrote:
> Use the generic setup_bdev_super helper to open the main block device
> and do various bits of superblock setup instead of duplicating the
> logic. This includes moving to the new scheme implemented in common
> code that only opens the block device after the superblock has allocated.
>
> It does not yet convert nilfs2 to the new mount API, but doing so will
> become a bit simpler after this first step.
>
> Signed-off-by: Christoph Hellwig <[email protected]>

AFAICS nilfs2 could *almost* use mount_bdev() directly and then just do its
snapshot thing after mount_bdev() returns. But it has this weird logic
that: "if the superblock is already mounted but we can shrink the whole
dcache, then do remount instead of ignoring mount options". Firstly, this
looks racy - what prevents someone from say opening a file on the sb just
after nilfs_tree_is_busy() shrinks dcache? Secondly, it is inconsistent
with any other filesystem so it's going to surprise sysadmins not
intimately knowing nilfs2. Thirdly, from userspace you cannot tell what
your mount call is going to do. Last but not least, what is it really good
for? Ryusuke, can you explain please?

Honza

> ---
> fs/nilfs2/super.c | 81 ++++++++++++++++++-----------------------------
> 1 file changed, 30 insertions(+), 51 deletions(-)
>
> diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
> index 0ef8c71bde8e5f..a5d1fa4e7552f6 100644
> --- a/fs/nilfs2/super.c
> +++ b/fs/nilfs2/super.c
> @@ -35,6 +35,7 @@
> #include <linux/writeback.h>
> #include <linux/seq_file.h>
> #include <linux/mount.h>
> +#include <linux/fs_context.h>
> #include "nilfs.h"
> #include "export.h"
> #include "mdt.h"
> @@ -1216,7 +1217,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
> }
>
> struct nilfs_super_data {
> - struct block_device *bdev;
> __u64 cno;
> int flags;
> };
> @@ -1283,64 +1283,49 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
>
> static int nilfs_set_bdev_super(struct super_block *s, void *data)
> {
> - s->s_bdev = data;
> - s->s_dev = s->s_bdev->bd_dev;
> + s->s_dev = *(dev_t *)data;
> return 0;
> }
>
> static int nilfs_test_bdev_super(struct super_block *s, void *data)
> {
> - return (void *)s->s_bdev == data;
> + return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data;
> }
>
> static struct dentry *
> nilfs_mount(struct file_system_type *fs_type, int flags,
> const char *dev_name, void *data)
> {
> - struct nilfs_super_data sd;
> + struct nilfs_super_data sd = { .flags = flags };
> struct super_block *s;
> - struct dentry *root_dentry;
> - int err, s_new = false;
> + dev_t dev;
> + int err;
>
> - sd.bdev = blkdev_get_by_path(dev_name, sb_open_mode(flags), fs_type,
> - NULL);
> - if (IS_ERR(sd.bdev))
> - return ERR_CAST(sd.bdev);
> + if (nilfs_identify(data, &sd))
> + return ERR_PTR(-EINVAL);
>
> - sd.cno = 0;
> - sd.flags = flags;
> - if (nilfs_identify((char *)data, &sd)) {
> - err = -EINVAL;
> - goto failed;
> - }
> + err = lookup_bdev(dev_name, &dev);
> + if (err)
> + return ERR_PTR(err);
>
> - /*
> - * once the super is inserted into the list by sget, s_umount
> - * will protect the lockfs code from trying to start a snapshot
> - * while we are mounting
> - */
> - mutex_lock(&sd.bdev->bd_fsfreeze_mutex);
> - if (sd.bdev->bd_fsfreeze_count > 0) {
> - mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
> - err = -EBUSY;
> - goto failed;
> - }
> s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, flags,
> - sd.bdev);
> - mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
> - if (IS_ERR(s)) {
> - err = PTR_ERR(s);
> - goto failed;
> - }
> + &dev);
> + if (IS_ERR(s))
> + return ERR_CAST(s);
>
> if (!s->s_root) {
> - s_new = true;
> -
> - /* New superblock instance created */
> - snprintf(s->s_id, sizeof(s->s_id), "%pg", sd.bdev);
> - sb_set_blocksize(s, block_size(sd.bdev));
> -
> - err = nilfs_fill_super(s, data, flags & SB_SILENT ? 1 : 0);
> + /*
> + * We drop s_umount here because we need to open the bdev and
> + * bdev->open_mutex ranks above s_umount (blkdev_put() ->
> + * __invalidate_device()). It is safe because we have active sb
> + * reference and SB_BORN is not set yet.
> + */
> + up_write(&s->s_umount);
> + err = setup_bdev_super(s, flags, NULL);
> + down_write(&s->s_umount);
> + if (!err)
> + err = nilfs_fill_super(s, data,
> + flags & SB_SILENT ? 1 : 0);
> if (err)
> goto failed_super;
>
> @@ -1366,24 +1351,18 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
> }
>
> if (sd.cno) {
> + struct dentry *root_dentry;
> +
> err = nilfs_attach_snapshot(s, sd.cno, &root_dentry);
> if (err)
> goto failed_super;
> - } else {
> - root_dentry = dget(s->s_root);
> + return root_dentry;
> }
>
> - if (!s_new)
> - blkdev_put(sd.bdev, fs_type);
> -
> - return root_dentry;
> + return dget(s->s_root);
>
> failed_super:
> deactivate_locked_super(s);
> -
> - failed:
> - if (!s_new)
> - blkdev_put(sd.bdev, fs_type);
> return ERR_PTR(err);
> }
>
> --
> 2.39.2
>
--
Jan Kara <[email protected]>
SUSE Labs, CR

2023-08-03 14:09:25

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH 08/12] fs: export fs_holder_ops

On Wed 02-08-23 17:41:27, Christoph Hellwig wrote:
> Export fs_holder_ops so that file systems that open additional block
> devices can use it as well.
>
> Signed-off-by: Christoph Hellwig <[email protected]>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <[email protected]>

Honza

> ---
> fs/super.c | 3 ++-
> include/linux/blkdev.h | 2 ++
> 2 files changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/fs/super.c b/fs/super.c
> index 0cda4af0a7e16c..dac05f96ab9ac8 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -1244,9 +1244,10 @@ static void fs_mark_dead(struct block_device *bdev)
> up_read(&sb->s_umount);
> }
>
> -static const struct blk_holder_ops fs_holder_ops = {
> +const struct blk_holder_ops fs_holder_ops = {
> .mark_dead = fs_mark_dead,
> };
> +EXPORT_SYMBOL_GPL(fs_holder_ops);
>
> static int set_bdev_super(struct super_block *s, void *data)
> {
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index ed44a997f629f5..83262702eea71a 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -1464,6 +1464,8 @@ struct blk_holder_ops {
> void (*mark_dead)(struct block_device *bdev);
> };
>
> +extern const struct blk_holder_ops fs_holder_ops;
> +
> /*
> * Return the correct open flags for blkdev_get_by_* for super block flags
> * as stored in sb->s_flags.
> --
> 2.39.2
>
--
Jan Kara <[email protected]>
SUSE Labs, CR

2023-08-03 14:35:00

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH 06/12] fs: use the super_block as holder when mounting file systems

On Thu 03-08-23 13:51:31, Jan Kara wrote:
> On Wed 02-08-23 17:41:25, Christoph Hellwig wrote:
> > The file system type is not a very useful holder as it doesn't allow us
> > to go back to the actual file system instance. Pass the super_block instead
> > which is useful when passed back to the file system driver.
> >
> > Signed-off-by: Christoph Hellwig <[email protected]>
>
> Nice, this is what I also wanted to eventually do :). Feel free to add:
>
> Reviewed-by: Jan Kara <[email protected]>

As a side note, after this patch we can also remove bdev->bd_super and
transition the two real users (mark_buffer_write_io_error() and two places
in ocfs2) to use bd_holder. Ext4 also uses bd_super but there it is really
pointless as we have the superblock directly available in that function
anyway.

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2023-08-03 14:46:41

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH 09/12] ext4: drop s_umount over opening the log device

On Wed 02-08-23 17:41:28, Christoph Hellwig wrote:
> Just like get_tree_bdev needs to drop s_umount when opening the main
> device, we need to do the same for the ext4 log device to avoid a
> potential lock order reversal with s_unmount for the mark_dead path.
>
> It might be preferable to just drop s_umount over ->fill_super entirely,
> but that will require a fairly massive audit first, so we'll do the easy
> version here first.
>
> Signed-off-by: Christoph Hellwig <[email protected]>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <[email protected]>

Honza

> ---
> fs/ext4/super.c | 3 +++
> 1 file changed, 3 insertions(+)
>
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 193d665813b611..2ccb19d345c6dd 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -5854,7 +5854,10 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
> if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
> return NULL;
>
> + /* see get_tree_bdev why this is needed and safe */
> + up_write(&sb->s_umount);
> bdev = ext4_blkdev_get(j_dev, sb);
> + down_write(&sb->s_umount);
> if (bdev == NULL)
> return NULL;
>
> --
> 2.39.2
>
--
Jan Kara <[email protected]>
SUSE Labs, CR

2023-08-03 18:29:10

by Christian Brauner

[permalink] [raw]
Subject: Re: [PATCH 08/12] fs: export fs_holder_ops

On Wed, Aug 02, 2023 at 05:41:27PM +0200, Christoph Hellwig wrote:
> Export fs_holder_ops so that file systems that open additional block
> devices can use it as well.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---

Looks good to me,
Reviewed-by: Christian Brauner <[email protected]>

2023-08-03 18:30:31

by Christian Brauner

[permalink] [raw]
Subject: Re: [PATCH 06/12] fs: use the super_block as holder when mounting file systems

On Wed, Aug 02, 2023 at 05:41:25PM +0200, Christoph Hellwig wrote:
> The file system type is not a very useful holder as it doesn't allow us
> to go back to the actual file system instance. Pass the super_block instead
> which is useful when passed back to the file system driver.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---

Looks good to me,
Reviewed-by: Christian Brauner <[email protected]>

2023-08-03 18:49:58

by Christian Brauner

[permalink] [raw]
Subject: Re: [PATCH 09/12] ext4: drop s_umount over opening the log device

On Wed, Aug 02, 2023 at 05:41:28PM +0200, Christoph Hellwig wrote:
> Just like get_tree_bdev needs to drop s_umount when opening the main
> device, we need to do the same for the ext4 log device to avoid a
> potential lock order reversal with s_unmount for the mark_dead path.
>
> It might be preferable to just drop s_umount over ->fill_super entirely,
> but that will require a fairly massive audit first, so we'll do the easy
> version here first.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---

Looks good to me,
Reviewed-by: Christian Brauner <[email protected]>

2023-08-04 02:27:39

by Ryusuke Konishi

[permalink] [raw]
Subject: Re: [PATCH 02/12] nilfs2: use setup_bdev_super to de-duplicate the mount code

On Thu, Aug 3, 2023 at 8:46 PM Jan Kara wrote:
>
> On Wed 02-08-23 17:41:21, Christoph Hellwig wrote:
> > Use the generic setup_bdev_super helper to open the main block device
> > and do various bits of superblock setup instead of duplicating the
> > logic. This includes moving to the new scheme implemented in common
> > code that only opens the block device after the superblock has allocated.
> >
> > It does not yet convert nilfs2 to the new mount API, but doing so will
> > become a bit simpler after this first step.
> >
> > Signed-off-by: Christoph Hellwig <[email protected]>
>
> AFAICS nilfs2 could *almost* use mount_bdev() directly and then just do its

> snapshot thing after mount_bdev() returns. But it has this weird logic
> that: "if the superblock is already mounted but we can shrink the whole
> dcache, then do remount instead of ignoring mount options". Firstly, this
> looks racy - what prevents someone from say opening a file on the sb just
> after nilfs_tree_is_busy() shrinks dcache? Secondly, it is inconsistent
> with any other filesystem so it's going to surprise sysadmins not
> intimately knowing nilfs2. Thirdly, from userspace you cannot tell what
> your mount call is going to do. Last but not least, what is it really good
> for? Ryusuke, can you explain please?
>
> Honza

I think you are referring to the following part:

> if (!s->s_root) {
...
> } else if (!sd.cno) {
> if (nilfs_tree_is_busy(s->s_root)) {
> if ((flags ^ s->s_flags) & SB_RDONLY) {
> nilfs_err(s,
> "the device already has a %s mount.",
> sb_rdonly(s) ? "read-only" : "read/write");
> err = -EBUSY;
> goto failed_super;
> }
> } else {
> /*
> * Try remount to setup mount states if the current
> * tree is not mounted and only snapshots use this sb.
> */
> err = nilfs_remount(s, &flags, data);
> if (err)
> goto failed_super;
> }
> }

What this logic is trying to do is, if there is already a nilfs2 mount
instance for the device, and are trying to mounting the current tree
(sd.cno is 0, so this is not a snapshot mount), then will switch
depending on whether the current tree has a mount:

- If the current tree is mounted, it's just like a normal filesystem.
(A read-only mount and a read/write mount can't coexist, so check
that, and reuse the instance if possible)
- Otherwise, i.e. for snapshot mounts only, do whatever is necessary
to add a new current mount, such as starting a log writer.
Since it does the same thing that nilfs_remount does, so
nilfs_remount() is used there.

Whether or not there is a current tree mount can be determined by
d_count(s->s_root) > 1 as nilfs_tree_is_busy() does.
Where s->s_root is always the root dentry of the current tree, not
that of the mounted snapshot.

I remember that calling shrink_dcache_parent() before this test was to
do the test correctly if there was garbage left in the dcache from the
past current mount.

If the current tree isn't mounted, it just cleans up the garbage, and
the reference count wouldn't have incremented in parallel.

If the current tree is mounted, d_count(s->s_root) will not decrease
to 1, so it's not a problem.
However, this will cause unexpected dcache shrinkage for the in-use
tree, so it's not a good idea, as you pointed out. If there is
another way of judging without this side effect, it should be
replaced.

I will reply here once.

Regards,
Ryusuke Konishi

2023-08-04 06:02:34

by Ryusuke Konishi

[permalink] [raw]
Subject: Re: [PATCH 02/12] nilfs2: use setup_bdev_super to de-duplicate the mount code

On Thu, Aug 3, 2023 at 12:41 AM Christoph Hellwig wrote:
>
> Use the generic setup_bdev_super helper to open the main block device
> and do various bits of superblock setup instead of duplicating the
> logic. This includes moving to the new scheme implemented in common
> code that only opens the block device after the superblock has allocated.
>
> It does not yet convert nilfs2 to the new mount API, but doing so will
> become a bit simpler after this first step.
>
> Signed-off-by: Christoph Hellwig <[email protected]>

Acked-by: Ryusuke Konishi <[email protected]>

This patch itself looks to properly convert nilfs_mount etc. Thank you so much.

Regards,
Ryusuke Konishi

2023-08-04 15:58:30

by Christian Brauner

[permalink] [raw]
Subject: Re: more blkdev_get and holder work

On Wed, 02 Aug 2023 17:41:19 +0200, Christoph Hellwig wrote:
> this series sits on top of the vfs.super branch in the VFS tree and does a
> few closely related things:
>
> 1) it also converts nilfs2 and btrfs to the new scheme where the file system
> only opens the block devices after we know that a new super_block was
> allocated.
> 2) it then makes sure that for all file system openers the super_block is
> stored in bd_holder, and makes use of that fact in the mark_dead method
> so that it doesn't have to fall get_super and thus can also work on
> block devices that sb->s_bdev doesn't point to
> 3) it then drops the fs-specific holder ops in ext4 and xfs and uses the
> generic fs_holder_ops there
>
> [...]

Let's pick this up now so it still has ample time in -next even though
we're still missing a nod from the btrfs people. The nilfs to
mount_bdev() conversion is probably not super urgent but if wanted a
follow-up patch won't be frowned upon.

---

Applied to the vfs.super branch of the vfs/vfs.git tree.
Patches in the vfs.super branch should appear in linux-next soon.

Please report any outstanding bugs that were missed during review in a
new review to the original patch series allowing us to drop it.

It's encouraged to provide Acked-bys and Reviewed-bys even though the
patch has now been applied. If possible patch trailers will be updated.

Note that commit hashes shown below are subject to change due to rebase,
trailer updates or similar. If in doubt, please check the listed branch.

tree: https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git
branch: vfs.super

[01/12] fs: export setup_bdev_super
https://git.kernel.org/vfs/vfs/c/71c00ec51d83
[02/12] nilfs2: use setup_bdev_super to de-duplicate the mount code
https://git.kernel.org/vfs/vfs/c/c820df38784a
[03/12] btrfs: always open the device read-only in btrfs_scan_one_device
https://git.kernel.org/vfs/vfs/c/75029e14cea6
[04/12] btrfs: open block devices after superblock creation
https://git.kernel.org/vfs/vfs/c/364820697dbb
[05/12] ext4: make the IS_EXT2_SB/IS_EXT3_SB checks more robust
https://git.kernel.org/vfs/vfs/c/4cf66c030db1
[06/12] fs: use the super_block as holder when mounting file systems
https://git.kernel.org/vfs/vfs/c/c0188baf8f7e
[07/12] fs: stop using get_super in fs_mark_dead
https://git.kernel.org/vfs/vfs/c/2a8402f9db25
[08/12] fs: export fs_holder_ops
https://git.kernel.org/vfs/vfs/c/ee62b0ec9ff8
[09/12] ext4: drop s_umount over opening the log device
https://git.kernel.org/vfs/vfs/c/644ab8c64a12
[10/12] ext4: use fs_holder_ops for the log device
https://git.kernel.org/vfs/vfs/c/fba3de1aad77
[11/12] xfs: drop s_umount over opening the log and RT devices
https://git.kernel.org/vfs/vfs/c/9470514a171c
[12/12] xfs use fs_holder_ops for the log and RT devices
https://git.kernel.org/vfs/vfs/c/c6fb2ed736e3

2023-08-04 21:01:27

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH 09/12] ext4: drop s_umount over opening the log device

On Wed, Aug 02, 2023 at 05:41:28PM +0200, Christoph Hellwig wrote:
> Just like get_tree_bdev needs to drop s_umount when opening the main
> device, we need to do the same for the ext4 log device to avoid a
> potential lock order reversal with s_unmount for the mark_dead path.
>
> It might be preferable to just drop s_umount over ->fill_super entirely,
> but that will require a fairly massive audit first, so we'll do the easy
> version here first.
>
> Signed-off-by: Christoph Hellwig <[email protected]>

Acked-by: Theodore Ts'o <[email protected]>

2023-08-05 10:20:24

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 06/12] fs: use the super_block as holder when mounting file systems

On Thu, Aug 03, 2023 at 03:33:30PM +0200, Jan Kara wrote:
> As a side note, after this patch we can also remove bdev->bd_super and
> transition the two real users (mark_buffer_write_io_error() and two places
> in ocfs2) to use bd_holder. Ext4 also uses bd_super but there it is really
> pointless as we have the superblock directly available in that function
> anyway.

I actually have a series to kill bd_super, but it uses b_assoc_map
as the replacement, as nothing in buffer.c should poke into the holder
and the buffer_head codes uses b_assoc_map a lot anyway. Let me rebase
it and send it out.

2023-08-10 11:56:59

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH 02/12] nilfs2: use setup_bdev_super to de-duplicate the mount code

On Fri 04-08-23 11:01:39, Ryusuke Konishi wrote:
> On Thu, Aug 3, 2023 at 8:46 PM Jan Kara wrote:
> >
> > On Wed 02-08-23 17:41:21, Christoph Hellwig wrote:
> > > Use the generic setup_bdev_super helper to open the main block device
> > > and do various bits of superblock setup instead of duplicating the
> > > logic. This includes moving to the new scheme implemented in common
> > > code that only opens the block device after the superblock has allocated.
> > >
> > > It does not yet convert nilfs2 to the new mount API, but doing so will
> > > become a bit simpler after this first step.
> > >
> > > Signed-off-by: Christoph Hellwig <[email protected]>
> >
> > AFAICS nilfs2 could *almost* use mount_bdev() directly and then just do its
>
> > snapshot thing after mount_bdev() returns. But it has this weird logic
> > that: "if the superblock is already mounted but we can shrink the whole
> > dcache, then do remount instead of ignoring mount options". Firstly, this
> > looks racy - what prevents someone from say opening a file on the sb just
> > after nilfs_tree_is_busy() shrinks dcache? Secondly, it is inconsistent
> > with any other filesystem so it's going to surprise sysadmins not
> > intimately knowing nilfs2. Thirdly, from userspace you cannot tell what
> > your mount call is going to do. Last but not least, what is it really good
> > for? Ryusuke, can you explain please?
> >
> > Honza
>
> I think you are referring to the following part:
>
> > if (!s->s_root) {
> ...
> > } else if (!sd.cno) {
> > if (nilfs_tree_is_busy(s->s_root)) {
> > if ((flags ^ s->s_flags) & SB_RDONLY) {
> > nilfs_err(s,
> > "the device already has a %s mount.",
> > sb_rdonly(s) ? "read-only" : "read/write");
> > err = -EBUSY;
> > goto failed_super;
> > }
> > } else {
> > /*
> > * Try remount to setup mount states if the current
> > * tree is not mounted and only snapshots use this sb.
> > */
> > err = nilfs_remount(s, &flags, data);
> > if (err)
> > goto failed_super;
> > }
> > }
>
> What this logic is trying to do is, if there is already a nilfs2 mount
> instance for the device, and are trying to mounting the current tree
> (sd.cno is 0, so this is not a snapshot mount), then will switch
> depending on whether the current tree has a mount:
>
> - If the current tree is mounted, it's just like a normal filesystem.
> (A read-only mount and a read/write mount can't coexist, so check
> that, and reuse the instance if possible)
> - Otherwise, i.e. for snapshot mounts only, do whatever is necessary
> to add a new current mount, such as starting a log writer.
> Since it does the same thing that nilfs_remount does, so
> nilfs_remount() is used there.
>
> Whether or not there is a current tree mount can be determined by
> d_count(s->s_root) > 1 as nilfs_tree_is_busy() does.
> Where s->s_root is always the root dentry of the current tree, not
> that of the mounted snapshot.

I see now, thanks for explanation! But one thing still is not clear to me.
If you say have a snapshot mounted read-write and then you mount the
current snapshot (cno == 0) read-only, you'll switch the whole superblock
to read-only state. So also the mounted snapshot is suddently read-only
which is unexpected and actually supposedly breaks things because you can
still have file handles open for writing on the snapshot etc.. So how do
you solve that?

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2023-08-10 18:12:55

by Ryusuke Konishi

[permalink] [raw]
Subject: Re: [PATCH 02/12] nilfs2: use setup_bdev_super to de-duplicate the mount code

On Thu, Aug 10, 2023 at 8:05 PM Jan Kara wrote:
>
> On Fri 04-08-23 11:01:39, Ryusuke Konishi wrote:
> > On Thu, Aug 3, 2023 at 8:46 PM Jan Kara wrote:
> > >
> > > On Wed 02-08-23 17:41:21, Christoph Hellwig wrote:
> > > > Use the generic setup_bdev_super helper to open the main block device
> > > > and do various bits of superblock setup instead of duplicating the
> > > > logic. This includes moving to the new scheme implemented in common
> > > > code that only opens the block device after the superblock has allocated.
> > > >
> > > > It does not yet convert nilfs2 to the new mount API, but doing so will
> > > > become a bit simpler after this first step.
> > > >
> > > > Signed-off-by: Christoph Hellwig <[email protected]>
> > >
> > > AFAICS nilfs2 could *almost* use mount_bdev() directly and then just do its
> >
> > > snapshot thing after mount_bdev() returns. But it has this weird logic
> > > that: "if the superblock is already mounted but we can shrink the whole
> > > dcache, then do remount instead of ignoring mount options". Firstly, this
> > > looks racy - what prevents someone from say opening a file on the sb just
> > > after nilfs_tree_is_busy() shrinks dcache? Secondly, it is inconsistent
> > > with any other filesystem so it's going to surprise sysadmins not
> > > intimately knowing nilfs2. Thirdly, from userspace you cannot tell what
> > > your mount call is going to do. Last but not least, what is it really good
> > > for? Ryusuke, can you explain please?
> > >
> > > Honza
> >
> > I think you are referring to the following part:
> >
> > > if (!s->s_root) {
> > ...
> > > } else if (!sd.cno) {
> > > if (nilfs_tree_is_busy(s->s_root)) {
> > > if ((flags ^ s->s_flags) & SB_RDONLY) {
> > > nilfs_err(s,
> > > "the device already has a %s mount.",
> > > sb_rdonly(s) ? "read-only" : "read/write");
> > > err = -EBUSY;
> > > goto failed_super;
> > > }
> > > } else {
> > > /*
> > > * Try remount to setup mount states if the current
> > > * tree is not mounted and only snapshots use this sb.
> > > */
> > > err = nilfs_remount(s, &flags, data);
> > > if (err)
> > > goto failed_super;
> > > }
> > > }
> >
> > What this logic is trying to do is, if there is already a nilfs2 mount
> > instance for the device, and are trying to mounting the current tree
> > (sd.cno is 0, so this is not a snapshot mount), then will switch
> > depending on whether the current tree has a mount:
> >
> > - If the current tree is mounted, it's just like a normal filesystem.
> > (A read-only mount and a read/write mount can't coexist, so check
> > that, and reuse the instance if possible)
> > - Otherwise, i.e. for snapshot mounts only, do whatever is necessary
> > to add a new current mount, such as starting a log writer.
> > Since it does the same thing that nilfs_remount does, so
> > nilfs_remount() is used there.
> >
> > Whether or not there is a current tree mount can be determined by
> > d_count(s->s_root) > 1 as nilfs_tree_is_busy() does.
> > Where s->s_root is always the root dentry of the current tree, not
> > that of the mounted snapshot.
>
> I see now, thanks for explanation! But one thing still is not clear to me.
> If you say have a snapshot mounted read-write and then you mount the
> current snapshot (cno == 0) read-only, you'll switch the whole superblock
> to read-only state. So also the mounted snapshot is suddently read-only
> which is unexpected and actually supposedly breaks things because you can
> still have file handles open for writing on the snapshot etc.. So how do
> you solve that?
>
> Honza

One thing I have to tell you as a premise is that nilfs2's snapshot
mounts (cno != 0) are read-only.

The read-only option is mandatory for nilfs2 snapshot mounts, so
remounting to read/write mode will result in an error.
This constraint is checked in nilfs_parse_snapshot_option() which is
called from nilfs_identify().

In fact, any write mode file/inode operations on a snapshot mount will
result in an EROFS error, regardless of whether the coexisting current
tree mount is read-only or read/write (i.e. regardless of the
read-only flag of the superblock instance).

This is mostly (and possibly entirely) accomplished at the vfs layer
by checking the MNT_READONLY flag in mnt_flags of the vfsmount
structure, and even on the nilfs2 side, iops->permission
(=nilfs_permission) rejects write operations on snapshot mounts.

Therefore, the problem you pointed out shouldn't occur in the first
place since the situation where a snapshot with a handle in write mode
suddenly becomes read-only doesn't happen. Unless I'm missing
something..

Regards,
Ryusuke Konishi

2023-08-10 18:39:20

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH 02/12] nilfs2: use setup_bdev_super to de-duplicate the mount code

On Fri 11-08-23 01:39:10, Ryusuke Konishi wrote:
> On Thu, Aug 10, 2023 at 8:05 PM Jan Kara wrote:
> >
> > On Fri 04-08-23 11:01:39, Ryusuke Konishi wrote:
> > > On Thu, Aug 3, 2023 at 8:46 PM Jan Kara wrote:
> > > >
> > > > On Wed 02-08-23 17:41:21, Christoph Hellwig wrote:
> > > > > Use the generic setup_bdev_super helper to open the main block device
> > > > > and do various bits of superblock setup instead of duplicating the
> > > > > logic. This includes moving to the new scheme implemented in common
> > > > > code that only opens the block device after the superblock has allocated.
> > > > >
> > > > > It does not yet convert nilfs2 to the new mount API, but doing so will
> > > > > become a bit simpler after this first step.
> > > > >
> > > > > Signed-off-by: Christoph Hellwig <[email protected]>
> > > >
> > > > AFAICS nilfs2 could *almost* use mount_bdev() directly and then just do its
> > >
> > > > snapshot thing after mount_bdev() returns. But it has this weird logic
> > > > that: "if the superblock is already mounted but we can shrink the whole
> > > > dcache, then do remount instead of ignoring mount options". Firstly, this
> > > > looks racy - what prevents someone from say opening a file on the sb just
> > > > after nilfs_tree_is_busy() shrinks dcache? Secondly, it is inconsistent
> > > > with any other filesystem so it's going to surprise sysadmins not
> > > > intimately knowing nilfs2. Thirdly, from userspace you cannot tell what
> > > > your mount call is going to do. Last but not least, what is it really good
> > > > for? Ryusuke, can you explain please?
> > > >
> > > > Honza
> > >
> > > I think you are referring to the following part:
> > >
> > > > if (!s->s_root) {
> > > ...
> > > > } else if (!sd.cno) {
> > > > if (nilfs_tree_is_busy(s->s_root)) {
> > > > if ((flags ^ s->s_flags) & SB_RDONLY) {
> > > > nilfs_err(s,
> > > > "the device already has a %s mount.",
> > > > sb_rdonly(s) ? "read-only" : "read/write");
> > > > err = -EBUSY;
> > > > goto failed_super;
> > > > }
> > > > } else {
> > > > /*
> > > > * Try remount to setup mount states if the current
> > > > * tree is not mounted and only snapshots use this sb.
> > > > */
> > > > err = nilfs_remount(s, &flags, data);
> > > > if (err)
> > > > goto failed_super;
> > > > }
> > > > }
> > >
> > > What this logic is trying to do is, if there is already a nilfs2 mount
> > > instance for the device, and are trying to mounting the current tree
> > > (sd.cno is 0, so this is not a snapshot mount), then will switch
> > > depending on whether the current tree has a mount:
> > >
> > > - If the current tree is mounted, it's just like a normal filesystem.
> > > (A read-only mount and a read/write mount can't coexist, so check
> > > that, and reuse the instance if possible)
> > > - Otherwise, i.e. for snapshot mounts only, do whatever is necessary
> > > to add a new current mount, such as starting a log writer.
> > > Since it does the same thing that nilfs_remount does, so
> > > nilfs_remount() is used there.
> > >
> > > Whether or not there is a current tree mount can be determined by
> > > d_count(s->s_root) > 1 as nilfs_tree_is_busy() does.
> > > Where s->s_root is always the root dentry of the current tree, not
> > > that of the mounted snapshot.
> >
> > I see now, thanks for explanation! But one thing still is not clear to me.
> > If you say have a snapshot mounted read-write and then you mount the
> > current snapshot (cno == 0) read-only, you'll switch the whole superblock
> > to read-only state. So also the mounted snapshot is suddently read-only
> > which is unexpected and actually supposedly breaks things because you can
> > still have file handles open for writing on the snapshot etc.. So how do
> > you solve that?
> >
> > Honza
>
> One thing I have to tell you as a premise is that nilfs2's snapshot
> mounts (cno != 0) are read-only.
>
> The read-only option is mandatory for nilfs2 snapshot mounts, so
> remounting to read/write mode will result in an error.
> This constraint is checked in nilfs_parse_snapshot_option() which is
> called from nilfs_identify().
>
> In fact, any write mode file/inode operations on a snapshot mount will
> result in an EROFS error, regardless of whether the coexisting current
> tree mount is read-only or read/write (i.e. regardless of the
> read-only flag of the superblock instance).
>
> This is mostly (and possibly entirely) accomplished at the vfs layer
> by checking the MNT_READONLY flag in mnt_flags of the vfsmount
> structure, and even on the nilfs2 side, iops->permission
> (=nilfs_permission) rejects write operations on snapshot mounts.
>
> Therefore, the problem you pointed out shouldn't occur in the first
> place since the situation where a snapshot with a handle in write mode
> suddenly becomes read-only doesn't happen. Unless I'm missing
> something..

No, I think you are correct. This particular case should be safe because
MNT_READONLY flags on the mounts used by snapshots will still keep them
read-only even if you remount the superblock to read-write mode for the
current snapshot. So I see why this is useful and I agree this isn't easy
to implement using mount_bdev() so no special code reduction here ;).
Thanks for patient explanation!

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2023-08-14 11:30:20

by Carlos Maiolino

[permalink] [raw]
Subject: Re: [PATCH 12/12] xfs use fs_holder_ops for the log and RT devices

On Wed, Aug 02, 2023 at 05:41:31PM +0200, Christoph Hellwig wrote:
> Use the generic fs_holder_ops to shut down the file system when the
> log or RT device goes away instead of duplicating the logic.
>
> Signed-off-by: Christoph Hellwig <[email protected]>
> ---
> fs/xfs/xfs_super.c | 17 +++--------------
> 1 file changed, 3 insertions(+), 14 deletions(-)

Looks good:

Reviewed-by: Carlos Maiolino <[email protected]>

Carlos
>
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index d5042419ed9997..338eba71ff8667 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -377,17 +377,6 @@ xfs_setup_dax_always(
> return 0;
> }
>
> -static void
> -xfs_bdev_mark_dead(
> - struct block_device *bdev)
> -{
> - xfs_force_shutdown(bdev->bd_holder, SHUTDOWN_DEVICE_REMOVED);
> -}
> -
> -static const struct blk_holder_ops xfs_holder_ops = {
> - .mark_dead = xfs_bdev_mark_dead,
> -};
> -
> STATIC int
> xfs_blkdev_get(
> xfs_mount_t *mp,
> @@ -396,8 +385,8 @@ xfs_blkdev_get(
> {
> int error = 0;
>
> - *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, mp,
> - &xfs_holder_ops);
> + *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE,
> + mp->m_super, &fs_holder_ops);
> if (IS_ERR(*bdevp)) {
> error = PTR_ERR(*bdevp);
> xfs_warn(mp, "Invalid device [%s], error=%d", name, error);
> @@ -412,7 +401,7 @@ xfs_blkdev_put(
> struct block_device *bdev)
> {
> if (bdev)
> - blkdev_put(bdev, mp);
> + blkdev_put(bdev, mp->m_super);
> }
>
> STATIC void
> --
> 2.39.2
>

2023-08-14 11:58:37

by Carlos Maiolino

[permalink] [raw]
Subject: Re: [PATCH 12/12] xfs use fs_holder_ops for the log and RT devices

On Wed, Aug 02, 2023 at 05:41:31PM +0200, Christoph Hellwig wrote:
> Use the generic fs_holder_ops to shut down the file system when the
> log or RT device goes away instead of duplicating the logic.
>
> Signed-off-by: Christoph Hellwig <[email protected]>

Reviewed-by: Carlos Maiolino <[email protected]>

Carlos

> ---
> fs/xfs/xfs_super.c | 17 +++--------------
> 1 file changed, 3 insertions(+), 14 deletions(-)
>
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index d5042419ed9997..338eba71ff8667 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -377,17 +377,6 @@ xfs_setup_dax_always(
> return 0;
> }
>
> -static void
> -xfs_bdev_mark_dead(
> - struct block_device *bdev)
> -{
> - xfs_force_shutdown(bdev->bd_holder, SHUTDOWN_DEVICE_REMOVED);
> -}
> -
> -static const struct blk_holder_ops xfs_holder_ops = {
> - .mark_dead = xfs_bdev_mark_dead,
> -};
> -
> STATIC int
> xfs_blkdev_get(
> xfs_mount_t *mp,
> @@ -396,8 +385,8 @@ xfs_blkdev_get(
> {
> int error = 0;
>
> - *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, mp,
> - &xfs_holder_ops);
> + *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE,
> + mp->m_super, &fs_holder_ops);
> if (IS_ERR(*bdevp)) {
> error = PTR_ERR(*bdevp);
> xfs_warn(mp, "Invalid device [%s], error=%d", name, error);
> @@ -412,7 +401,7 @@ xfs_blkdev_put(
> struct block_device *bdev)
> {
> if (bdev)
> - blkdev_put(bdev, mp);
> + blkdev_put(bdev, mp->m_super);
> }
>
> STATIC void
> --
> 2.39.2
>