2024-01-23 09:44:27

by Johannes Thumshirn

[permalink] [raw]
Subject: [PATCH 0/5] block: remove gfp_mask for blkdev_zone_mgmt()

Fueled by the LSFMM discussion on removing GFP_NOFS initiated by Willy,
I've looked into the sole GFP_NOFS allocation in zonefs. As it turned out,
it is only done for zone management commands and can be removed.

After digging into more callers of blkdev_zone_mgmt() I came to the
conclusion that the gfp_mask parameter can be removed alltogether.

So this series switches all callers of blkdev_zone_mgmt() to either use
GFP_KERNEL where possible or grab a memalloc_no{fs,io} context.

The final patch in this series is getting rid of the gfp_mask parameter.

Link: https://lore.kernel.org/all/[email protected]/

---
Johannes Thumshirn (5):
zonefs: pass GFP_KERNEL to blkdev_zone_mgmt() call
dm: dm-zoned: pass GFP_KERNEL to blkdev_zone_mgmt
btrfs: zoned: call blkdev_zone_mgmt in nofs scope
f2fs: guard blkdev_zone_mgmt with nofs scope
block: remove gfp_flags from blkdev_zone_mgmt

block/blk-zoned.c | 19 ++++++++-----------
drivers/md/dm-zoned-metadata.c | 2 +-
drivers/nvme/target/zns.c | 5 ++---
fs/btrfs/zoned.c | 35 +++++++++++++++++++++++++----------
fs/f2fs/segment.c | 15 ++++++++++++---
fs/zonefs/super.c | 2 +-
include/linux/blkdev.h | 2 +-
7 files changed, 50 insertions(+), 30 deletions(-)
---
base-commit: 7ed2632ec7d72e926b9e8bcc9ad1bb0cd37274bf
change-id: 20240110-zonefs_nofs-dd1e22b2e046

Best regards,
--
Johannes Thumshirn <[email protected]>



2024-01-23 09:44:41

by Johannes Thumshirn

[permalink] [raw]
Subject: [PATCH 1/5] zonefs: pass GFP_KERNEL to blkdev_zone_mgmt() call

Pass GFP_KERNEL instead of GFP_NOFS to the blkdev_zone_mgmt() call in
zonefs_zone_mgmt().

As as zonefs_zone_mgmt() and zonefs_inode_zone_mgmt() are never called
from a place that can recurse back into the filesystem on memory reclaim,
it is save to call blkdev_zone_mgmt() with GFP_KERNEL.

Signed-off-by: Johannes Thumshirn <[email protected]>
---
fs/zonefs/super.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 93971742613a..63fbac018c04 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -113,7 +113,7 @@ static int zonefs_zone_mgmt(struct super_block *sb,

trace_zonefs_zone_mgmt(sb, z, op);
ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector,
- z->z_size >> SECTOR_SHIFT, GFP_NOFS);
+ z->z_size >> SECTOR_SHIFT, GFP_KERNEL);
if (ret) {
zonefs_err(sb,
"Zone management operation %s at %llu failed %d\n",

--
2.43.0


2024-01-23 09:45:01

by Johannes Thumshirn

[permalink] [raw]
Subject: [PATCH 2/5] dm: dm-zoned: pass GFP_KERNEL to blkdev_zone_mgmt

The call to blkdev_zone_mgmt() in dm-zoned is only used to perform a
ZONE_RESET operation when freeing a zone.

This is not done in the IO path, so we can use GFP_KERNEL here, as it will
never recurse back into the driver on reclaim.

Signed-off-by: Johannes Thumshirn <[email protected]>
---
drivers/md/dm-zoned-metadata.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index fdfe30f7b697..a39734f0cb7d 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -1658,7 +1658,7 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone)

ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET,
dmz_start_sect(zmd, zone),
- zmd->zone_nr_sectors, GFP_NOIO);
+ zmd->zone_nr_sectors, GFP_KERNEL);
if (ret) {
dmz_dev_err(dev, "Reset zone %u failed %d",
zone->id, ret);

--
2.43.0


2024-01-23 09:45:25

by Johannes Thumshirn

[permalink] [raw]
Subject: [PATCH 3/5] btrfs: zoned: call blkdev_zone_mgmt in nofs scope

Add a memalloc_nofs scope around all calls to blkdev_zone_mgmt(). This
allows us to further get rid of the GFP_NOFS argument for
blkdev_zone_mgmt().

Signed-off-by: Johannes Thumshirn <[email protected]>
---
fs/btrfs/zoned.c | 33 ++++++++++++++++++++++++++-------
1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 168af9d000d1..05640d61e435 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -824,11 +824,15 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
reset = &zones[1];

if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
+ unsigned int nofs_flags;
+
ASSERT(sb_zone_is_full(reset));

+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
reset->start, reset->len,
- GFP_NOFS);
+ GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;

@@ -974,11 +978,14 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
* explicit ZONE_FINISH is not necessary.
*/
if (zone->wp != zone->start + zone->capacity) {
+ unsigned int nofs_flags;
int ret;

+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev,
REQ_OP_ZONE_FINISH, zone->start,
- zone->len, GFP_NOFS);
+ zone->len, GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
}
@@ -996,11 +1003,13 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)

int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
+ unsigned int nofs_flags;
sector_t zone_sectors;
sector_t nr_sectors;
u8 zone_sectors_shift;
u32 sb_zone;
u32 nr_zones;
+ int ret;

zone_sectors = bdev_zone_sectors(bdev);
zone_sectors_shift = ilog2(zone_sectors);
@@ -1011,9 +1020,13 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
if (sb_zone + 1 >= nr_zones)
return -ENOENT;

- return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
- zone_start_sector(sb_zone, bdev),
- zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
+ nofs_flags = memalloc_nofs_save();
+ ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+ zone_start_sector(sb_zone, bdev),
+ zone_sectors * BTRFS_NR_SB_LOG_ZONES,
+ GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flags);
+ return ret;
}

/*
@@ -1124,12 +1137,15 @@ static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
u64 length, u64 *bytes)
{
+ unsigned int nofs_flags;
int ret;

*bytes = 0;
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
- GFP_NOFS);
+ GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;

@@ -2234,14 +2250,17 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
struct btrfs_device *device = map->stripes[i].dev;
const u64 physical = map->stripes[i].physical;
struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ unsigned int nofs_flags;

if (zinfo->max_active_zones == 0)
continue;

+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
physical >> SECTOR_SHIFT,
zinfo->zone_size >> SECTOR_SHIFT,
- GFP_NOFS);
+ GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flags);

if (ret)
return ret;

--
2.43.0


2024-01-23 09:46:01

by Johannes Thumshirn

[permalink] [raw]
Subject: [PATCH 4/5] f2fs: guard blkdev_zone_mgmt with nofs scope

Guard the calls to blkdev_zone_mgmt() with a memalloc_nofs scope.
This helps us getting rid of the GFP_NOFS argument to blkdev_zone_mgmt();

Signed-off-by: Johannes Thumshirn <[email protected]>
---
fs/f2fs/segment.c | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 4c8836ded90f..0094fe491364 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1971,9 +1971,15 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
}

if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) {
+ unsigned int nofs_flags;
+ int ret;
+
trace_f2fs_issue_reset_zone(bdev, blkstart);
- return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
- sector, nr_sects, GFP_NOFS);
+ nofs_flags = memalloc_nofs_save();
+ ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+ sector, nr_sects, GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flags);
+ return ret;
}

__queue_zone_reset_cmd(sbi, bdev, blkstart, lblkstart, blklen);
@@ -4865,6 +4871,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
block_t zone_block, valid_block_cnt;
unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT;
int ret;
+ unsigned int nofs_flags;

if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
return 0;
@@ -4912,8 +4919,10 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
"pointer: valid block[0x%x,0x%x] cond[0x%x]",
zone_segno, valid_block_cnt, zone->cond);

+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH,
- zone->start, zone->len, GFP_NOFS);
+ zone->start, zone->len, GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flags);
if (ret == -EOPNOTSUPP) {
ret = blkdev_issue_zeroout(fdev->bdev, zone->wp,
zone->len - (zone->wp - zone->start),

--
2.43.0


2024-01-23 09:51:57

by Johannes Thumshirn

[permalink] [raw]
Subject: [PATCH 5/5] block: remove gfp_flags from blkdev_zone_mgmt

Now that all callers pass in GFP_KERNEL to blkdev_zone_mgmt() and use
memalloc_no{io,fs}_{save,restore}() to define the allocation scope, we can
drop the gfp_mask parameter from blkdev_zone_mgmt() as well as
blkdev_zone_reset_all() and blkdev_zone_reset_all_emulated().

Signed-off-by: Johannes Thumshirn <[email protected]>
---
block/blk-zoned.c | 19 ++++++++-----------
drivers/nvme/target/zns.c | 5 ++---
fs/btrfs/zoned.c | 14 +++++---------
fs/f2fs/segment.c | 4 ++--
fs/zonefs/super.c | 2 +-
include/linux/blkdev.h | 2 +-
6 files changed, 19 insertions(+), 27 deletions(-)

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index d343e5756a9c..d4f4f8325eff 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -177,8 +177,7 @@ static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx,
}
}

-static int blkdev_zone_reset_all_emulated(struct block_device *bdev,
- gfp_t gfp_mask)
+static int blkdev_zone_reset_all_emulated(struct block_device *bdev)
{
struct gendisk *disk = bdev->bd_disk;
sector_t capacity = bdev_nr_sectors(bdev);
@@ -205,7 +204,7 @@ static int blkdev_zone_reset_all_emulated(struct block_device *bdev,
}

bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC,
- gfp_mask);
+ GFP_KERNEL);
bio->bi_iter.bi_sector = sector;
sector += zone_sectors;

@@ -223,7 +222,7 @@ static int blkdev_zone_reset_all_emulated(struct block_device *bdev,
return ret;
}

-static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
+static int blkdev_zone_reset_all(struct block_device *bdev)
{
struct bio bio;

@@ -238,7 +237,6 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
* @sector: Start sector of the first zone to operate on
* @nr_sectors: Number of sectors, should be at least the length of one zone and
* must be zone size aligned.
- * @gfp_mask: Memory allocation flags (for bio_alloc)
*
* Description:
* Perform the specified operation on the range of zones specified by
@@ -248,7 +246,7 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
* or finish request.
*/
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
- sector_t sector, sector_t nr_sectors, gfp_t gfp_mask)
+ sector_t sector, sector_t nr_sectors)
{
struct request_queue *q = bdev_get_queue(bdev);
sector_t zone_sectors = bdev_zone_sectors(bdev);
@@ -285,12 +283,12 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
*/
if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) {
if (!blk_queue_zone_resetall(q))
- return blkdev_zone_reset_all_emulated(bdev, gfp_mask);
- return blkdev_zone_reset_all(bdev, gfp_mask);
+ return blkdev_zone_reset_all_emulated(bdev);
+ return blkdev_zone_reset_all(bdev);
}

while (sector < end_sector) {
- bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, gfp_mask);
+ bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL);
bio->bi_iter.bi_sector = sector;
sector += zone_sectors;

@@ -419,8 +417,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
return -ENOTTY;
}

- ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors,
- GFP_KERNEL);
+ ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);

fail:
if (cmd == BLKRESETZONE)
diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c
index 5b5c1e481722..3148d9f1bde6 100644
--- a/drivers/nvme/target/zns.c
+++ b/drivers/nvme/target/zns.c
@@ -456,8 +456,7 @@ static u16 nvmet_bdev_execute_zmgmt_send_all(struct nvmet_req *req)
switch (zsa_req_op(req->cmd->zms.zsa)) {
case REQ_OP_ZONE_RESET:
ret = blkdev_zone_mgmt(req->ns->bdev, REQ_OP_ZONE_RESET, 0,
- get_capacity(req->ns->bdev->bd_disk),
- GFP_KERNEL);
+ get_capacity(req->ns->bdev->bd_disk));
if (ret < 0)
return blkdev_zone_mgmt_errno_to_nvme_status(ret);
break;
@@ -508,7 +507,7 @@ static void nvmet_bdev_zmgmt_send_work(struct work_struct *w)
goto out;
}

- ret = blkdev_zone_mgmt(bdev, op, sect, zone_sectors, GFP_KERNEL);
+ ret = blkdev_zone_mgmt(bdev, op, sect, zone_sectors);
if (ret < 0)
status = blkdev_zone_mgmt_errno_to_nvme_status(ret);

diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 05640d61e435..cf2e779d8ef4 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -830,8 +830,7 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,

nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
- reset->start, reset->len,
- GFP_KERNEL);
+ reset->start, reset->len);
memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
@@ -984,7 +983,7 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev,
REQ_OP_ZONE_FINISH, zone->start,
- zone->len, GFP_KERNEL);
+ zone->len);
memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
@@ -1023,8 +1022,7 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
zone_start_sector(sb_zone, bdev),
- zone_sectors * BTRFS_NR_SB_LOG_ZONES,
- GFP_KERNEL);
+ zone_sectors * BTRFS_NR_SB_LOG_ZONES);
memalloc_nofs_restore(nofs_flags);
return ret;
}
@@ -1143,8 +1141,7 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
*bytes = 0;
nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
- physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
- GFP_KERNEL);
+ physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT);
memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
@@ -2258,8 +2255,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
physical >> SECTOR_SHIFT,
- zinfo->zone_size >> SECTOR_SHIFT,
- GFP_KERNEL);
+ zinfo->zone_size >> SECTOR_SHIFT);
memalloc_nofs_restore(nofs_flags);

if (ret)
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 0094fe491364..e1065ba70207 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1977,7 +1977,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
trace_f2fs_issue_reset_zone(bdev, blkstart);
nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
- sector, nr_sects, GFP_KERNEL);
+ sector, nr_sects);
memalloc_nofs_restore(nofs_flags);
return ret;
}
@@ -4921,7 +4921,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,

nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH,
- zone->start, zone->len, GFP_KERNEL);
+ zone->start, zone->len);
memalloc_nofs_restore(nofs_flags);
if (ret == -EOPNOTSUPP) {
ret = blkdev_issue_zeroout(fdev->bdev, zone->wp,
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 63fbac018c04..cadb1364f951 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -113,7 +113,7 @@ static int zonefs_zone_mgmt(struct super_block *sb,

trace_zonefs_zone_mgmt(sb, z, op);
ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector,
- z->z_size >> SECTOR_SHIFT, GFP_KERNEL);
+ z->z_size >> SECTOR_SHIFT);
if (ret) {
zonefs_err(sb,
"Zone management operation %s at %llu failed %d\n",
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 99e4f5e72213..8467c1910404 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -325,7 +325,7 @@ void disk_set_zoned(struct gendisk *disk);
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data);
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
- sector_t sectors, sector_t nr_sectors, gfp_t gfp_mask);
+ sector_t sectors, sector_t nr_sectors);
int blk_revalidate_disk_zones(struct gendisk *disk,
void (*update_driver_data)(struct gendisk *disk));


--
2.43.0


2024-01-23 20:40:47

by Mikulas Patocka

[permalink] [raw]
Subject: Re: [PATCH 1/5] zonefs: pass GFP_KERNEL to blkdev_zone_mgmt() call



On Tue, 23 Jan 2024, Johannes Thumshirn wrote:

> Pass GFP_KERNEL instead of GFP_NOFS to the blkdev_zone_mgmt() call in
> zonefs_zone_mgmt().
>
> As as zonefs_zone_mgmt() and zonefs_inode_zone_mgmt() are never called
> from a place that can recurse back into the filesystem on memory reclaim,
> it is save to call blkdev_zone_mgmt() with GFP_KERNEL.
>
> Signed-off-by: Johannes Thumshirn <[email protected]>
> ---
> fs/zonefs/super.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
> index 93971742613a..63fbac018c04 100644
> --- a/fs/zonefs/super.c
> +++ b/fs/zonefs/super.c
> @@ -113,7 +113,7 @@ static int zonefs_zone_mgmt(struct super_block *sb,
>
> trace_zonefs_zone_mgmt(sb, z, op);
> ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector,
> - z->z_size >> SECTOR_SHIFT, GFP_NOFS);
> + z->z_size >> SECTOR_SHIFT, GFP_KERNEL);
> if (ret) {
> zonefs_err(sb,
> "Zone management operation %s at %llu failed %d\n",
>
> --
> 2.43.0

zonefs_inode_zone_mgmt calls
lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex); - so, this
function is called with the mutex held - could it happen that the
GFP_KERNEL allocation recurses into the filesystem and attempts to take
i_truncate_mutex as well?

i.e. GFP_KERNEL -> iomap_do_writepage -> zonefs_write_map_blocks ->
zonefs_write_iomap_begin -> mutex_lock(&zi->i_truncate_mutex)

Mikulas


2024-01-23 20:54:42

by Mikulas Patocka

[permalink] [raw]
Subject: Re: [PATCH 2/5] dm: dm-zoned: pass GFP_KERNEL to blkdev_zone_mgmt



On Tue, 23 Jan 2024, Johannes Thumshirn wrote:

> The call to blkdev_zone_mgmt() in dm-zoned is only used to perform a
> ZONE_RESET operation when freeing a zone.
>
> This is not done in the IO path, so we can use GFP_KERNEL here, as it will
> never recurse back into the driver on reclaim.

Hi

This doesn't seem safe to me. There's a possible call chain dmz_handle_bio
-> dmz_put_chunk_mapping -> dmz_free_zone -> dmz_reset_zone ->
blkdev_zone_mgmt -> recursion (via GFP_KERNEL) back into the dm-zoned
target.

Mikulas

> Signed-off-by: Johannes Thumshirn <[email protected]>
> ---
> drivers/md/dm-zoned-metadata.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
> index fdfe30f7b697..a39734f0cb7d 100644
> --- a/drivers/md/dm-zoned-metadata.c
> +++ b/drivers/md/dm-zoned-metadata.c
> @@ -1658,7 +1658,7 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
>
> ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET,
> dmz_start_sect(zmd, zone),
> - zmd->zone_nr_sectors, GFP_NOIO);
> + zmd->zone_nr_sectors, GFP_KERNEL);
> if (ret) {
> dmz_dev_err(dev, "Reset zone %u failed %d",
> zone->id, ret);
>
> --
> 2.43.0
>
>


2024-01-23 23:23:02

by Dave Chinner

[permalink] [raw]
Subject: Re: [PATCH 1/5] zonefs: pass GFP_KERNEL to blkdev_zone_mgmt() call

On Tue, Jan 23, 2024 at 09:39:02PM +0100, Mikulas Patocka wrote:
>
>
> On Tue, 23 Jan 2024, Johannes Thumshirn wrote:
>
> > Pass GFP_KERNEL instead of GFP_NOFS to the blkdev_zone_mgmt() call in
> > zonefs_zone_mgmt().
> >
> > As as zonefs_zone_mgmt() and zonefs_inode_zone_mgmt() are never called
> > from a place that can recurse back into the filesystem on memory reclaim,
> > it is save to call blkdev_zone_mgmt() with GFP_KERNEL.
> >
> > Signed-off-by: Johannes Thumshirn <[email protected]>
> > ---
> > fs/zonefs/super.c | 2 +-
> > 1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
> > index 93971742613a..63fbac018c04 100644
> > --- a/fs/zonefs/super.c
> > +++ b/fs/zonefs/super.c
> > @@ -113,7 +113,7 @@ static int zonefs_zone_mgmt(struct super_block *sb,
> >
> > trace_zonefs_zone_mgmt(sb, z, op);
> > ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector,
> > - z->z_size >> SECTOR_SHIFT, GFP_NOFS);
> > + z->z_size >> SECTOR_SHIFT, GFP_KERNEL);
> > if (ret) {
> > zonefs_err(sb,
> > "Zone management operation %s at %llu failed %d\n",
> >
> > --
> > 2.43.0
>
> zonefs_inode_zone_mgmt calls
> lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex); - so, this
> function is called with the mutex held - could it happen that the
> GFP_KERNEL allocation recurses into the filesystem and attempts to take
> i_truncate_mutex as well?
>
> i.e. GFP_KERNEL -> iomap_do_writepage -> zonefs_write_map_blocks ->
> zonefs_write_iomap_begin -> mutex_lock(&zi->i_truncate_mutex)

zonefs doesn't have a ->writepage method, so writeback can't be
called from memory reclaim like this.

-Dave.
--
Dave Chinner
[email protected]

2024-01-23 23:31:30

by Damien Le Moal

[permalink] [raw]
Subject: Re: [PATCH 1/5] zonefs: pass GFP_KERNEL to blkdev_zone_mgmt() call

On 1/24/24 08:21, Dave Chinner wrote:
> On Tue, Jan 23, 2024 at 09:39:02PM +0100, Mikulas Patocka wrote:
>>
>>
>> On Tue, 23 Jan 2024, Johannes Thumshirn wrote:
>>
>>> Pass GFP_KERNEL instead of GFP_NOFS to the blkdev_zone_mgmt() call in
>>> zonefs_zone_mgmt().
>>>
>>> As as zonefs_zone_mgmt() and zonefs_inode_zone_mgmt() are never called
>>> from a place that can recurse back into the filesystem on memory reclaim,
>>> it is save to call blkdev_zone_mgmt() with GFP_KERNEL.
>>>
>>> Signed-off-by: Johannes Thumshirn <[email protected]>
>>> ---
>>> fs/zonefs/super.c | 2 +-
>>> 1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
>>> index 93971742613a..63fbac018c04 100644
>>> --- a/fs/zonefs/super.c
>>> +++ b/fs/zonefs/super.c
>>> @@ -113,7 +113,7 @@ static int zonefs_zone_mgmt(struct super_block *sb,
>>>
>>> trace_zonefs_zone_mgmt(sb, z, op);
>>> ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector,
>>> - z->z_size >> SECTOR_SHIFT, GFP_NOFS);
>>> + z->z_size >> SECTOR_SHIFT, GFP_KERNEL);
>>> if (ret) {
>>> zonefs_err(sb,
>>> "Zone management operation %s at %llu failed %d\n",
>>>
>>> --
>>> 2.43.0
>>
>> zonefs_inode_zone_mgmt calls
>> lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex); - so, this
>> function is called with the mutex held - could it happen that the
>> GFP_KERNEL allocation recurses into the filesystem and attempts to take
>> i_truncate_mutex as well?
>>
>> i.e. GFP_KERNEL -> iomap_do_writepage -> zonefs_write_map_blocks ->
>> zonefs_write_iomap_begin -> mutex_lock(&zi->i_truncate_mutex)
>
> zonefs doesn't have a ->writepage method, so writeback can't be
> called from memory reclaim like this.

And also, buffered writes are allowed only for conventional zone files, for
which we never do zone management. For sequential zone files which may have
there zone managed with blkdev_zone_mgmt(), only direct writes are allowed.

>
> -Dave.

--
Damien Le Moal
Western Digital Research


2024-01-26 22:05:12

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH 5/5] block: remove gfp_flags from blkdev_zone_mgmt

Hi Johannes,

kernel test robot noticed the following build errors:

[auto build test ERROR on 7ed2632ec7d72e926b9e8bcc9ad1bb0cd37274bf]

url: https://github.com/intel-lab-lkp/linux/commits/Johannes-Thumshirn/zonefs-pass-GFP_KERNEL-to-blkdev_zone_mgmt-call/20240123-174911
base: 7ed2632ec7d72e926b9e8bcc9ad1bb0cd37274bf
patch link: https://lore.kernel.org/r/20240123-zonefs_nofs-v1-5-cc0b0308ef25%40wdc.com
patch subject: [PATCH 5/5] block: remove gfp_flags from blkdev_zone_mgmt
config: hexagon-allmodconfig (https://download.01.org/0day-ci/archive/20240127/[email protected]/config)
compiler: clang version 18.0.0git (https://github.com/llvm/llvm-project a31a60074717fc40887cfe132b77eec93bedd307)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240127/[email protected]/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/

All errors (new ones prefixed by >>):

In file included from drivers/md/dm-zoned-metadata.c:8:
In file included from drivers/md/dm-zoned.h:12:
In file included from include/linux/blkdev.h:9:
In file included from include/linux/blk_types.h:10:
In file included from include/linux/bvec.h:10:
In file included from include/linux/highmem.h:12:
In file included from include/linux/hardirq.h:11:
In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
In file included from include/asm-generic/hardirq.h:17:
In file included from include/linux/irq.h:20:
In file included from include/linux/io.h:13:
In file included from arch/hexagon/include/asm/io.h:328:
include/asm-generic/io.h:547:31: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
547 | val = __raw_readb(PCI_IOBASE + addr);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:560:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
560 | val = __le16_to_cpu((__le16 __force)__raw_readw(PCI_IOBASE + addr));
| ~~~~~~~~~~ ^
include/uapi/linux/byteorder/little_endian.h:37:51: note: expanded from macro '__le16_to_cpu'
37 | #define __le16_to_cpu(x) ((__force __u16)(__le16)(x))
| ^
In file included from drivers/md/dm-zoned-metadata.c:8:
In file included from drivers/md/dm-zoned.h:12:
In file included from include/linux/blkdev.h:9:
In file included from include/linux/blk_types.h:10:
In file included from include/linux/bvec.h:10:
In file included from include/linux/highmem.h:12:
In file included from include/linux/hardirq.h:11:
In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
In file included from include/asm-generic/hardirq.h:17:
In file included from include/linux/irq.h:20:
In file included from include/linux/io.h:13:
In file included from arch/hexagon/include/asm/io.h:328:
include/asm-generic/io.h:573:61: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
573 | val = __le32_to_cpu((__le32 __force)__raw_readl(PCI_IOBASE + addr));
| ~~~~~~~~~~ ^
include/uapi/linux/byteorder/little_endian.h:35:51: note: expanded from macro '__le32_to_cpu'
35 | #define __le32_to_cpu(x) ((__force __u32)(__le32)(x))
| ^
In file included from drivers/md/dm-zoned-metadata.c:8:
In file included from drivers/md/dm-zoned.h:12:
In file included from include/linux/blkdev.h:9:
In file included from include/linux/blk_types.h:10:
In file included from include/linux/bvec.h:10:
In file included from include/linux/highmem.h:12:
In file included from include/linux/hardirq.h:11:
In file included from ./arch/hexagon/include/generated/asm/hardirq.h:1:
In file included from include/asm-generic/hardirq.h:17:
In file included from include/linux/irq.h:20:
In file included from include/linux/io.h:13:
In file included from arch/hexagon/include/asm/io.h:328:
include/asm-generic/io.h:584:33: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
584 | __raw_writeb(value, PCI_IOBASE + addr);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:594:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
594 | __raw_writew((u16 __force)cpu_to_le16(value), PCI_IOBASE + addr);
| ~~~~~~~~~~ ^
include/asm-generic/io.h:604:59: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic]
604 | __raw_writel((u32 __force)cpu_to_le32(value), PCI_IOBASE + addr);
| ~~~~~~~~~~ ^
>> drivers/md/dm-zoned-metadata.c:1661:34: error: too many arguments to function call, expected 4, have 5
1659 | ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET,
| ~~~~~~~~~~~~~~~~
1660 | dmz_start_sect(zmd, zone),
1661 | zmd->zone_nr_sectors, GFP_KERNEL);
| ^~~~~~~~~~
include/linux/gfp_types.h:327:20: note: expanded from macro 'GFP_KERNEL'
327 | #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
include/linux/blkdev.h:327:5: note: 'blkdev_zone_mgmt' declared here
327 | int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
| ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
328 | sector_t sectors, sector_t nr_sectors);
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
6 warnings and 1 error generated.


vim +1661 drivers/md/dm-zoned-metadata.c

3b1a94c88b798d Damien Le Moal 2017-06-07 1639
3b1a94c88b798d Damien Le Moal 2017-06-07 1640 /*
3b1a94c88b798d Damien Le Moal 2017-06-07 1641 * Reset a zone write pointer.
3b1a94c88b798d Damien Le Moal 2017-06-07 1642 */
3b1a94c88b798d Damien Le Moal 2017-06-07 1643 static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
3b1a94c88b798d Damien Le Moal 2017-06-07 1644 {
3b1a94c88b798d Damien Le Moal 2017-06-07 1645 int ret;
3b1a94c88b798d Damien Le Moal 2017-06-07 1646
3b1a94c88b798d Damien Le Moal 2017-06-07 1647 /*
3b1a94c88b798d Damien Le Moal 2017-06-07 1648 * Ignore offline zones, read only zones,
3b1a94c88b798d Damien Le Moal 2017-06-07 1649 * and conventional zones.
3b1a94c88b798d Damien Le Moal 2017-06-07 1650 */
3b1a94c88b798d Damien Le Moal 2017-06-07 1651 if (dmz_is_offline(zone) ||
3b1a94c88b798d Damien Le Moal 2017-06-07 1652 dmz_is_readonly(zone) ||
3b1a94c88b798d Damien Le Moal 2017-06-07 1653 dmz_is_rnd(zone))
3b1a94c88b798d Damien Le Moal 2017-06-07 1654 return 0;
3b1a94c88b798d Damien Le Moal 2017-06-07 1655
3b1a94c88b798d Damien Le Moal 2017-06-07 1656 if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) {
8f22272af7a727 Hannes Reinecke 2020-06-02 1657 struct dmz_dev *dev = zone->dev;
3b1a94c88b798d Damien Le Moal 2017-06-07 1658
6c1b1da58f8c7a Ajay Joshi 2019-10-27 1659 ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET,
3b1a94c88b798d Damien Le Moal 2017-06-07 1660 dmz_start_sect(zmd, zone),
c4d4977392621f Johannes Thumshirn 2024-01-23 @1661 zmd->zone_nr_sectors, GFP_KERNEL);
3b1a94c88b798d Damien Le Moal 2017-06-07 1662 if (ret) {
3b1a94c88b798d Damien Le Moal 2017-06-07 1663 dmz_dev_err(dev, "Reset zone %u failed %d",
b71228739851a9 Hannes Reinecke 2020-05-11 1664 zone->id, ret);
3b1a94c88b798d Damien Le Moal 2017-06-07 1665 return ret;
3b1a94c88b798d Damien Le Moal 2017-06-07 1666 }
3b1a94c88b798d Damien Le Moal 2017-06-07 1667 }
3b1a94c88b798d Damien Le Moal 2017-06-07 1668
3b1a94c88b798d Damien Le Moal 2017-06-07 1669 /* Clear write error bit and rewind write pointer position */
3b1a94c88b798d Damien Le Moal 2017-06-07 1670 clear_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
3b1a94c88b798d Damien Le Moal 2017-06-07 1671 zone->wp_block = 0;
3b1a94c88b798d Damien Le Moal 2017-06-07 1672
3b1a94c88b798d Damien Le Moal 2017-06-07 1673 return 0;
3b1a94c88b798d Damien Le Moal 2017-06-07 1674 }
3b1a94c88b798d Damien Le Moal 2017-06-07 1675

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

2024-01-27 09:58:40

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH 5/5] block: remove gfp_flags from blkdev_zone_mgmt

Hi Johannes,

kernel test robot noticed the following build errors:

[auto build test ERROR on 7ed2632ec7d72e926b9e8bcc9ad1bb0cd37274bf]

url: https://github.com/intel-lab-lkp/linux/commits/Johannes-Thumshirn/zonefs-pass-GFP_KERNEL-to-blkdev_zone_mgmt-call/20240123-174911
base: 7ed2632ec7d72e926b9e8bcc9ad1bb0cd37274bf
patch link: https://lore.kernel.org/r/20240123-zonefs_nofs-v1-5-cc0b0308ef25%40wdc.com
patch subject: [PATCH 5/5] block: remove gfp_flags from blkdev_zone_mgmt
config: alpha-allyesconfig (https://download.01.org/0day-ci/archive/20240127/[email protected]/config)
compiler: alpha-linux-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240127/[email protected]/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/

All errors (new ones prefixed by >>):

drivers/md/dm-zoned-metadata.c: In function 'dmz_reset_zone':
>> drivers/md/dm-zoned-metadata.c:1659:23: error: too many arguments to function 'blkdev_zone_mgmt'
1659 | ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET,
| ^~~~~~~~~~~~~~~~
In file included from drivers/md/dm-zoned.h:12,
from drivers/md/dm-zoned-metadata.c:8:
include/linux/blkdev.h:327:5: note: declared here
327 | int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
| ^~~~~~~~~~~~~~~~


vim +/blkdev_zone_mgmt +1659 drivers/md/dm-zoned-metadata.c

3b1a94c88b798d Damien Le Moal 2017-06-07 1639
3b1a94c88b798d Damien Le Moal 2017-06-07 1640 /*
3b1a94c88b798d Damien Le Moal 2017-06-07 1641 * Reset a zone write pointer.
3b1a94c88b798d Damien Le Moal 2017-06-07 1642 */
3b1a94c88b798d Damien Le Moal 2017-06-07 1643 static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
3b1a94c88b798d Damien Le Moal 2017-06-07 1644 {
3b1a94c88b798d Damien Le Moal 2017-06-07 1645 int ret;
3b1a94c88b798d Damien Le Moal 2017-06-07 1646
3b1a94c88b798d Damien Le Moal 2017-06-07 1647 /*
3b1a94c88b798d Damien Le Moal 2017-06-07 1648 * Ignore offline zones, read only zones,
3b1a94c88b798d Damien Le Moal 2017-06-07 1649 * and conventional zones.
3b1a94c88b798d Damien Le Moal 2017-06-07 1650 */
3b1a94c88b798d Damien Le Moal 2017-06-07 1651 if (dmz_is_offline(zone) ||
3b1a94c88b798d Damien Le Moal 2017-06-07 1652 dmz_is_readonly(zone) ||
3b1a94c88b798d Damien Le Moal 2017-06-07 1653 dmz_is_rnd(zone))
3b1a94c88b798d Damien Le Moal 2017-06-07 1654 return 0;
3b1a94c88b798d Damien Le Moal 2017-06-07 1655
3b1a94c88b798d Damien Le Moal 2017-06-07 1656 if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) {
8f22272af7a727 Hannes Reinecke 2020-06-02 1657 struct dmz_dev *dev = zone->dev;
3b1a94c88b798d Damien Le Moal 2017-06-07 1658
6c1b1da58f8c7a Ajay Joshi 2019-10-27 @1659 ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET,
3b1a94c88b798d Damien Le Moal 2017-06-07 1660 dmz_start_sect(zmd, zone),
c4d4977392621f Johannes Thumshirn 2024-01-23 1661 zmd->zone_nr_sectors, GFP_KERNEL);
3b1a94c88b798d Damien Le Moal 2017-06-07 1662 if (ret) {
3b1a94c88b798d Damien Le Moal 2017-06-07 1663 dmz_dev_err(dev, "Reset zone %u failed %d",
b71228739851a9 Hannes Reinecke 2020-05-11 1664 zone->id, ret);
3b1a94c88b798d Damien Le Moal 2017-06-07 1665 return ret;
3b1a94c88b798d Damien Le Moal 2017-06-07 1666 }
3b1a94c88b798d Damien Le Moal 2017-06-07 1667 }
3b1a94c88b798d Damien Le Moal 2017-06-07 1668
3b1a94c88b798d Damien Le Moal 2017-06-07 1669 /* Clear write error bit and rewind write pointer position */
3b1a94c88b798d Damien Le Moal 2017-06-07 1670 clear_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
3b1a94c88b798d Damien Le Moal 2017-06-07 1671 zone->wp_block = 0;
3b1a94c88b798d Damien Le Moal 2017-06-07 1672
3b1a94c88b798d Damien Le Moal 2017-06-07 1673 return 0;
3b1a94c88b798d Damien Le Moal 2017-06-07 1674 }
3b1a94c88b798d Damien Le Moal 2017-06-07 1675

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki