As already reported in v3, the nvme_ns_head change was not gaining us anything.
Instead a simple repacking of nvme_ns gives better performance. Thus these
patches are gone.
Thanks,
Daniel
libnvme changes:
https://github.com/igaw/libnvme/tree/tree-no-cmd
changes:
v4:
- drop 'use nvme_ns_head instead nvme_ns' patches
- ratelimit nuse update per namespace not globally
- rename ns attribute group
- added non-multipath nuse update logic
- added cacheline optimization
v3:
- cut overlong lines shorter
- fixed disk (queuedata) initialization order
- more testing with blktest
- added nuse ratelimit
- added reviewed tags
- https://lore.kernel.org/linux-nvme/[email protected]/
v2:
- moved ns id data to nvme_ns_head
- dropped ds, nsze
- https://lore.kernel.org/linux-nvme/[email protected]/
v1:
- initial version
- https://lore.kernel.org/linux-nvme/[email protected]/
Daniel Wagner (4):
nvme: move ns id info to struct nvme_ns_head
nvme: rename ns attribute group
nvme: add csi, ms and nuse to sysfs
nvme: repack struct nvme_ns_head
drivers/nvme/host/core.c | 87 ++++++++++++++--------------
drivers/nvme/host/ioctl.c | 8 +--
drivers/nvme/host/multipath.c | 2 +-
drivers/nvme/host/nvme.h | 38 +++++++------
drivers/nvme/host/rdma.c | 2 +-
drivers/nvme/host/sysfs.c | 103 +++++++++++++++++++++++++++++++---
drivers/nvme/host/zns.c | 17 +++---
7 files changed, 179 insertions(+), 78 deletions(-)
--
2.43.0
Move the namesapce info to struct nvme_ns_head, because it's the same
for all associated namespaces.
Signed-off-by: Daniel Wagner <[email protected]>
---
drivers/nvme/host/core.c | 81 ++++++++++++++++++++-------------------
drivers/nvme/host/ioctl.c | 8 ++--
drivers/nvme/host/nvme.h | 28 +++++++-------
drivers/nvme/host/rdma.c | 2 +-
drivers/nvme/host/zns.c | 17 ++++----
5 files changed, 70 insertions(+), 66 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index d699f0c8b13e..72908e622049 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -312,12 +312,12 @@ static void nvme_log_error(struct request *req)
struct nvme_request *nr = nvme_req(req);
if (ns) {
- pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %llu blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
+ pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %u blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
ns->disk ? ns->disk->disk_name : "?",
nvme_get_opcode_str(nr->cmd->common.opcode),
nr->cmd->common.opcode,
- (unsigned long long)nvme_sect_to_lba(ns, blk_rq_pos(req)),
- (unsigned long long)blk_rq_bytes(req) >> ns->lba_shift,
+ nvme_sect_to_lba(ns, blk_rq_pos(req)),
+ blk_rq_bytes(req) >> ns->head->lba_shift,
nvme_get_error_status_str(nr->status),
nr->status >> 8 & 7, /* Status Code Type */
nr->status & 0xff, /* Status Code */
@@ -794,7 +794,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
if (queue_max_discard_segments(req->q) == 1) {
u64 slba = nvme_sect_to_lba(ns, blk_rq_pos(req));
- u32 nlb = blk_rq_sectors(req) >> (ns->lba_shift - 9);
+ u32 nlb = blk_rq_sectors(req) >> (ns->head->lba_shift - 9);
range[0].cattr = cpu_to_le32(0);
range[0].nlb = cpu_to_le32(nlb);
@@ -803,7 +803,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
} else {
__rq_for_each_bio(bio, req) {
u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
- u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
+ u32 nlb = bio->bi_iter.bi_size >> ns->head->lba_shift;
if (n < segments) {
range[n].cattr = cpu_to_le32(0);
@@ -841,7 +841,7 @@ static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
u64 ref48;
/* both rw and write zeroes share the same reftag format */
- switch (ns->guard_type) {
+ switch (ns->head->guard_type) {
case NVME_NVM_NS_16B_GUARD:
cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
break;
@@ -871,15 +871,16 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
cmnd->write_zeroes.slba =
cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
cmnd->write_zeroes.length =
- cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+ cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
- if (!(req->cmd_flags & REQ_NOUNMAP) && (ns->features & NVME_NS_DEAC))
+ if (!(req->cmd_flags & REQ_NOUNMAP) &&
+ (ns->head->features & NVME_NS_DEAC))
cmnd->write_zeroes.control |= cpu_to_le16(NVME_WZ_DEAC);
if (nvme_ns_has_pi(ns)) {
cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT);
- switch (ns->pi_type) {
+ switch (ns->head->pi_type) {
case NVME_NS_DPS_PI_TYPE1:
case NVME_NS_DPS_PI_TYPE2:
nvme_set_ref_tag(ns, cmnd, req);
@@ -912,12 +913,13 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
cmnd->rw.cdw3 = 0;
cmnd->rw.metadata = 0;
cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
- cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+ cmnd->rw.length =
+ cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
cmnd->rw.reftag = 0;
cmnd->rw.apptag = 0;
cmnd->rw.appmask = 0;
- if (ns->ms) {
+ if (ns->head->ms) {
/*
* If formated with metadata, the block layer always provides a
* metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else
@@ -930,7 +932,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
control |= NVME_RW_PRINFO_PRACT;
}
- switch (ns->pi_type) {
+ switch (ns->head->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
control |= NVME_RW_PRINFO_PRCHK_GUARD;
break;
@@ -1676,9 +1678,9 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
{
struct blk_integrity integrity = { };
- switch (ns->pi_type) {
+ switch (ns->head->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
- switch (ns->guard_type) {
+ switch (ns->head->guard_type) {
case NVME_NVM_NS_16B_GUARD:
integrity.profile = &t10_pi_type3_crc;
integrity.tag_size = sizeof(u16) + sizeof(u32);
@@ -1696,7 +1698,7 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
break;
case NVME_NS_DPS_PI_TYPE1:
case NVME_NS_DPS_PI_TYPE2:
- switch (ns->guard_type) {
+ switch (ns->head->guard_type) {
case NVME_NVM_NS_16B_GUARD:
integrity.profile = &t10_pi_type1_crc;
integrity.tag_size = sizeof(u16);
@@ -1717,7 +1719,7 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
break;
}
- integrity.tuple_size = ns->ms;
+ integrity.tuple_size = ns->head->ms;
blk_integrity_register(disk, &integrity);
blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
}
@@ -1776,11 +1778,11 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
int ret = 0;
u32 elbaf;
- ns->pi_size = 0;
- ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
+ ns->head->pi_size = 0;
+ ns->head->ms = le16_to_cpu(id->lbaf[lbaf].ms);
if (!(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
- ns->pi_size = sizeof(struct t10_pi_tuple);
- ns->guard_type = NVME_NVM_NS_16B_GUARD;
+ ns->head->pi_size = sizeof(struct t10_pi_tuple);
+ ns->head->guard_type = NVME_NVM_NS_16B_GUARD;
goto set_pi;
}
@@ -1803,13 +1805,13 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
if (nvme_elbaf_sts(elbaf))
goto free_data;
- ns->guard_type = nvme_elbaf_guard_type(elbaf);
- switch (ns->guard_type) {
+ ns->head->guard_type = nvme_elbaf_guard_type(elbaf);
+ switch (ns->head->guard_type) {
case NVME_NVM_NS_64B_GUARD:
- ns->pi_size = sizeof(struct crc64_pi_tuple);
+ ns->head->pi_size = sizeof(struct crc64_pi_tuple);
break;
case NVME_NVM_NS_16B_GUARD:
- ns->pi_size = sizeof(struct t10_pi_tuple);
+ ns->head->pi_size = sizeof(struct t10_pi_tuple);
break;
default:
break;
@@ -1818,10 +1820,10 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
free_data:
kfree(nvm);
set_pi:
- if (ns->pi_size && (first || ns->ms == ns->pi_size))
- ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
+ if (ns->head->pi_size && (first || ns->head->ms == ns->head->pi_size))
+ ns->head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
else
- ns->pi_type = 0;
+ ns->head->pi_type = 0;
return ret;
}
@@ -1835,8 +1837,8 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
if (ret)
return ret;
- ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
- if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
+ ns->head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
+ if (!ns->head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
return 0;
if (ctrl->ops->flags & NVME_F_FABRICS) {
@@ -1848,7 +1850,7 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
return 0;
- ns->features |= NVME_NS_EXT_LBAS;
+ ns->head->features |= NVME_NS_EXT_LBAS;
/*
* The current fabrics transport drivers support namespace
@@ -1860,7 +1862,7 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
* gain the ability to use other metadata formats.
*/
if (ctrl->max_integrity_segments && nvme_ns_has_pi(ns))
- ns->features |= NVME_NS_METADATA_SUPPORTED;
+ ns->head->features |= NVME_NS_METADATA_SUPPORTED;
} else {
/*
* For PCIe controllers, we can't easily remap the separate
@@ -1869,9 +1871,9 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
* We allow extended LBAs for the passthrough interface, though.
*/
if (id->flbas & NVME_NS_FLBAS_META_EXT)
- ns->features |= NVME_NS_EXT_LBAS;
+ ns->head->features |= NVME_NS_EXT_LBAS;
else
- ns->features |= NVME_NS_METADATA_SUPPORTED;
+ ns->head->features |= NVME_NS_METADATA_SUPPORTED;
}
return 0;
}
@@ -1898,7 +1900,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
struct nvme_ns *ns, struct nvme_id_ns *id)
{
sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
- u32 bs = 1U << ns->lba_shift;
+ u32 bs = 1U << ns->head->lba_shift;
u32 atomic_bs, phys_bs, io_opt = 0;
/*
@@ -1906,7 +1908,8 @@ static void nvme_update_disk_info(struct gendisk *disk,
* or smaller than a sector size yet, so catch this early and don't
* allow block I/O.
*/
- if (ns->lba_shift > PAGE_SHIFT || ns->lba_shift < SECTOR_SHIFT) {
+ if (ns->head->lba_shift > PAGE_SHIFT ||
+ ns->head->lba_shift < SECTOR_SHIFT) {
capacity = 0;
bs = (1 << 9);
}
@@ -1949,9 +1952,9 @@ static void nvme_update_disk_info(struct gendisk *disk,
* I/O to namespaces with metadata except when the namespace supports
* PI, as it can strip/insert in that case.
*/
- if (ns->ms) {
+ if (ns->head->ms) {
if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
- (ns->features & NVME_NS_METADATA_SUPPORTED))
+ (ns->head->features & NVME_NS_METADATA_SUPPORTED))
nvme_init_integrity(disk, ns,
ns->ctrl->max_integrity_segments);
else if (!nvme_ns_has_pi(ns))
@@ -2052,7 +2055,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
blk_mq_freeze_queue(ns->disk->queue);
lbaf = nvme_lbaf_index(id->flbas);
- ns->lba_shift = id->lbaf[lbaf].ds;
+ ns->head->lba_shift = id->lbaf[lbaf].ds;
nvme_set_queue_limits(ns->ctrl, ns->queue);
ret = nvme_configure_metadata(ns, id);
@@ -2078,7 +2081,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
* do not return zeroes.
*/
if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
- ns->features |= NVME_NS_DEAC;
+ ns->head->features |= NVME_NS_DEAC;
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
set_bit(NVME_NS_READY, &ns->flags);
blk_mq_unfreeze_queue(ns->disk->queue);
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 529b9954d2b8..feee9cf50670 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -283,10 +283,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
return -EINVAL;
}
- length = (io.nblocks + 1) << ns->lba_shift;
+ length = (io.nblocks + 1) << ns->head->lba_shift;
if ((io.control & NVME_RW_PRINFO_PRACT) &&
- ns->ms == sizeof(struct t10_pi_tuple)) {
+ ns->head->ms == sizeof(struct t10_pi_tuple)) {
/*
* Protection information is stripped/inserted by the
* controller.
@@ -296,11 +296,11 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
meta_len = 0;
metadata = NULL;
} else {
- meta_len = (io.nblocks + 1) * ns->ms;
+ meta_len = (io.nblocks + 1) * ns->head->ms;
metadata = nvme_to_user_ptr(io.metadata);
}
- if (ns->features & NVME_NS_EXT_LBAS) {
+ if (ns->head->features & NVME_NS_EXT_LBAS) {
length += meta_len;
meta_len = 0;
} else if (meta_len) {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 578e6d311bc9..1ebe6a9b42c9 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -451,6 +451,17 @@ struct nvme_ns_head {
bool shared;
int instance;
struct nvme_effects_log *effects;
+ int lba_shift;
+ u16 ms;
+ u16 pi_size;
+ u16 sgs;
+ u32 sws;
+ u8 pi_type;
+ u8 guard_type;
+#ifdef CONFIG_BLK_DEV_ZONED
+ u64 zsze;
+#endif
+ unsigned long features;
struct cdev cdev;
struct device cdev_device;
@@ -492,17 +503,6 @@ struct nvme_ns {
struct kref kref;
struct nvme_ns_head *head;
- int lba_shift;
- u16 ms;
- u16 pi_size;
- u16 sgs;
- u32 sws;
- u8 pi_type;
- u8 guard_type;
-#ifdef CONFIG_BLK_DEV_ZONED
- u64 zsze;
-#endif
- unsigned long features;
unsigned long flags;
#define NVME_NS_REMOVING 0
#define NVME_NS_ANA_PENDING 2
@@ -519,7 +519,7 @@ struct nvme_ns {
/* NVMe ns supports metadata actions by the controller (generate/strip) */
static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
{
- return ns->pi_type && ns->ms == ns->pi_size;
+ return ns->head->pi_type && ns->head->ms == ns->head->pi_size;
}
struct nvme_ctrl_ops {
@@ -653,7 +653,7 @@ static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl)
*/
static inline u64 nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector)
{
- return sector >> (ns->lba_shift - SECTOR_SHIFT);
+ return sector >> (ns->head->lba_shift - SECTOR_SHIFT);
}
/*
@@ -661,7 +661,7 @@ static inline u64 nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector)
*/
static inline sector_t nvme_lba_to_sect(struct nvme_ns *ns, u64 lba)
{
- return lba << (ns->lba_shift - SECTOR_SHIFT);
+ return lba << (ns->head->lba_shift - SECTOR_SHIFT);
}
/*
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 81e2621169e5..fc0df91e6b36 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1423,7 +1423,7 @@ static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue,
goto mr_put;
nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_bdev->bd_disk), c,
- req->mr->sig_attrs, ns->pi_type);
+ req->mr->sig_attrs, ns->head->pi_type);
nvme_rdma_set_prot_checks(c, &req->mr->sig_attrs->check_mask);
ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index ec8557810c21..2a8871e0cc9b 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -11,7 +11,7 @@ int nvme_revalidate_zones(struct nvme_ns *ns)
{
struct request_queue *q = ns->queue;
- blk_queue_chunk_sectors(q, ns->zsze);
+ blk_queue_chunk_sectors(q, ns->head->zsze);
blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append);
return blk_revalidate_disk_zones(ns->disk, NULL);
@@ -99,11 +99,12 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
goto free_data;
}
- ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
- if (!is_power_of_2(ns->zsze)) {
+ ns->head->zsze =
+ nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
+ if (!is_power_of_2(ns->head->zsze)) {
dev_warn(ns->ctrl->device,
"invalid zone size:%llu for namespace:%u\n",
- ns->zsze, ns->head->ns_id);
+ ns->head->zsze, ns->head->ns_id);
status = -ENODEV;
goto free_data;
}
@@ -128,7 +129,7 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
sizeof(struct nvme_zone_descriptor);
nr_zones = min_t(unsigned int, nr_zones,
- get_capacity(ns->disk) >> ilog2(ns->zsze));
+ get_capacity(ns->head->disk) >> ilog2(ns->head->zsze));
bufsize = sizeof(struct nvme_zone_report) +
nr_zones * sizeof(struct nvme_zone_descriptor);
@@ -162,7 +163,7 @@ static int nvme_zone_parse_entry(struct nvme_ns *ns,
zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
zone.cond = entry->zs >> 4;
- zone.len = ns->zsze;
+ zone.len = ns->head->zsze;
zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
if (zone.cond == BLK_ZONE_COND_FULL)
@@ -196,7 +197,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
- sector &= ~(ns->zsze - 1);
+ sector &= ~(ns->head->zsze - 1);
while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
memset(report, 0, buflen);
@@ -220,7 +221,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
zone_idx++;
}
- sector += ns->zsze * nz;
+ sector += ns->head->zsze * nz;
}
if (zone_idx > 0)
--
2.43.0
libnvme is using the sysfs for enumarating the nvme resources. Though
there are few missing attritbutes in the sysfs. For these libnvme issues
commands during discovering.
As the kernel already knows all these attributes and we would like to
avoid libnvme to issue commands all the time, expose these missing
attributes.
The nuse value is updated on request because the nuse is a volatile
value. Since any user can read the sysfs attribute, a very simple rate
limit is added (update once every 5 seconds). A more sophisticated
update strategy can be added later if there is actually a need for it.
Signed-off-by: Daniel Wagner <[email protected]>
---
drivers/nvme/host/core.c | 4 +-
drivers/nvme/host/nvme.h | 6 +++
drivers/nvme/host/sysfs.c | 89 +++++++++++++++++++++++++++++++++++++++
3 files changed, 98 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c3270818fa0d..82c6faf424d6 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1454,7 +1454,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl,
return status;
}
-static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
+int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
struct nvme_id_ns **id)
{
struct nvme_command c = { };
@@ -2056,6 +2056,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
blk_mq_freeze_queue(ns->disk->queue);
lbaf = nvme_lbaf_index(id->flbas);
ns->head->lba_shift = id->lbaf[lbaf].ds;
+ ns->head->nuse = le64_to_cpu(id->nuse);
nvme_set_queue_limits(ns->ctrl, ns->queue);
ret = nvme_configure_metadata(ns, id);
@@ -3418,6 +3419,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
head->ns_id = info->nsid;
head->ids = info->ids;
head->shared = info->is_shared;
+ ratelimit_state_init(&head->rs_nuse, 5 * HZ, 1);
kref_init(&head->ref);
if (head->ids.csi) {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 32ec7ca30d7c..2b31641a97b9 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -16,6 +16,7 @@
#include <linux/rcupdate.h>
#include <linux/wait.h>
#include <linux/t10-pi.h>
+#include <linux/ratelimit_types.h>
#include <trace/events/block.h>
@@ -456,6 +457,7 @@ struct nvme_ns_head {
u16 pi_size;
u16 sgs;
u32 sws;
+ u64 nuse;
u8 pi_type;
u8 guard_type;
#ifdef CONFIG_BLK_DEV_ZONED
@@ -463,6 +465,8 @@ struct nvme_ns_head {
#endif
unsigned long features;
+ struct ratelimit_state rs_nuse;
+
struct cdev cdev;
struct device cdev_device;
@@ -867,6 +871,8 @@ int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd,
unsigned int issue_flags);
int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
unsigned int issue_flags);
+int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
+ struct nvme_id_ns **id);
int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index d682d0a667a0..79e362e0ac47 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -114,12 +114,101 @@ static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
}
static DEVICE_ATTR_RO(nsid);
+static ssize_t csi_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ids.csi);
+}
+static DEVICE_ATTR_RO(csi);
+
+static ssize_t metadata_bytes_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ms);
+}
+static DEVICE_ATTR_RO(metadata_bytes);
+
+static int ns_head_update_nuse(struct nvme_ns_head *head)
+{
+ struct nvme_id_ns *id;
+ struct nvme_ns *ns;
+ int srcu_idx, ret = -EWOULDBLOCK;
+
+ /* Avoid issuing commands too often by rate limiting the update */
+ if (!__ratelimit(&head->rs_nuse))
+ return 0;
+
+ pr_info("%s: head %p\n", __func__, head);
+
+ srcu_idx = srcu_read_lock(&head->srcu);
+ ns = nvme_find_path(head);
+ if (!ns)
+ goto out_unlock;
+
+ ret = nvme_identify_ns(ns->ctrl, head->ns_id, &id);
+ if (ret)
+ goto out_unlock;
+
+ head->nuse = le64_to_cpu(id->nuse);
+ kfree(id);
+
+out_unlock:
+ srcu_read_unlock(&head->srcu, srcu_idx);
+ return ret;
+}
+
+static int ns_update_nuse(struct nvme_ns *ns)
+{
+ struct nvme_id_ns *id;
+ int ret;
+
+ /* Avoid issuing commands too often by rate limiting the update. */
+ if (!__ratelimit(&ns->head->rs_nuse))
+ return 0;
+
+ pr_info("%s: ns %p\n", __func__, ns);
+
+ ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, &id);
+ if (ret)
+ goto out_free_id;
+
+ ns->head->nuse = le64_to_cpu(id->nuse);
+
+out_free_id:
+ kfree(id);
+
+ return ret;
+}
+
+static ssize_t nuse_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct nvme_ns_head *head = dev_to_ns_head(dev);
+ struct gendisk *disk = dev_to_disk(dev);
+ struct block_device *bdev = disk->part0;
+ int ret;
+
+ if (IS_ENABLED(CONFIG_NVME_MULTIPATH) &&
+ bdev->bd_disk->fops == &nvme_ns_head_ops)
+ ret = ns_head_update_nuse(head);
+ else
+ ret = ns_update_nuse(bdev->bd_disk->private_data);
+ if (ret)
+ return ret;
+
+ return sysfs_emit(buf, "%llu\n", head->nuse);
+}
+static DEVICE_ATTR_RO(nuse);
+
static struct attribute *nvme_ns_attrs[] = {
&dev_attr_wwid.attr,
&dev_attr_uuid.attr,
&dev_attr_nguid.attr,
&dev_attr_eui.attr,
+ &dev_attr_csi.attr,
&dev_attr_nsid.attr,
+ &dev_attr_metadata_bytes.attr,
+ &dev_attr_nuse.attr,
#ifdef CONFIG_NVME_MULTIPATH
&dev_attr_ana_grpid.attr,
&dev_attr_ana_state.attr,
--
2.43.0
ns_id, lba_shift and ms are always accessed for every read/write I/O in
nvme_setup_rw. By grouping these variables into one cacheline we can
safe some cycles.
4k sequential reads:
baseline patched
Bandwidth: 1620 1634
IOPs 66345579 66910939
Signed-off-by: Daniel Wagner <[email protected]>
---
drivers/nvme/host/nvme.h | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 2b31641a97b9..4a4beaecc832 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -445,21 +445,21 @@ struct nvme_ns_head {
struct list_head list;
struct srcu_struct srcu;
struct nvme_subsystem *subsys;
- unsigned ns_id;
struct nvme_ns_ids ids;
struct list_head entry;
struct kref ref;
bool shared;
int instance;
struct nvme_effects_log *effects;
+ u64 nuse;
+ unsigned ns_id;
int lba_shift;
u16 ms;
u16 pi_size;
- u16 sgs;
- u32 sws;
- u64 nuse;
u8 pi_type;
u8 guard_type;
+ u16 sgs;
+ u32 sws;
#ifdef CONFIG_BLK_DEV_ZONED
u64 zsze;
#endif
--
2.43.0
Drop the 'id' part of the attribute group name because we want to expose
non 'id' related attributes via the ns attribute group.
Signed-off-by: Daniel Wagner <[email protected]>
---
drivers/nvme/host/core.c | 2 +-
drivers/nvme/host/multipath.c | 2 +-
drivers/nvme/host/nvme.h | 2 +-
drivers/nvme/host/sysfs.c | 14 +++++++-------
4 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 72908e622049..c3270818fa0d 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3669,7 +3669,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
up_write(&ctrl->namespaces_rwsem);
nvme_get_ctrl(ctrl);
- if (device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups))
+ if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups))
goto out_cleanup_ns_from_list;
if (!nvme_ns_head_multipath(ns->head))
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 0a88d7bdc5e3..2dd4137a08b2 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -579,7 +579,7 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
*/
if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
rc = device_add_disk(&head->subsys->dev, head->disk,
- nvme_ns_id_attr_groups);
+ nvme_ns_attr_groups);
if (rc) {
clear_bit(NVME_NSHEAD_DISK_LIVE, &ns->flags);
return;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 1ebe6a9b42c9..32ec7ca30d7c 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -870,7 +870,7 @@ int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
-extern const struct attribute_group *nvme_ns_id_attr_groups[];
+extern const struct attribute_group *nvme_ns_attr_groups[];
extern const struct pr_ops nvme_pr_ops;
extern const struct block_device_operations nvme_ns_head_ops;
extern const struct attribute_group nvme_dev_attrs_group;
diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index c6b7fbd4d34d..d682d0a667a0 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -114,7 +114,7 @@ static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
}
static DEVICE_ATTR_RO(nsid);
-static struct attribute *nvme_ns_id_attrs[] = {
+static struct attribute *nvme_ns_attrs[] = {
&dev_attr_wwid.attr,
&dev_attr_uuid.attr,
&dev_attr_nguid.attr,
@@ -127,7 +127,7 @@ static struct attribute *nvme_ns_id_attrs[] = {
NULL,
};
-static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
+static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
struct attribute *a, int n)
{
struct device *dev = container_of(kobj, struct device, kobj);
@@ -157,13 +157,13 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
return a->mode;
}
-static const struct attribute_group nvme_ns_id_attr_group = {
- .attrs = nvme_ns_id_attrs,
- .is_visible = nvme_ns_id_attrs_are_visible,
+static const struct attribute_group nvme_ns_attr_group = {
+ .attrs = nvme_ns_attrs,
+ .is_visible = nvme_ns_attrs_are_visible,
};
-const struct attribute_group *nvme_ns_id_attr_groups[] = {
- &nvme_ns_id_attr_group,
+const struct attribute_group *nvme_ns_attr_groups[] = {
+ &nvme_ns_attr_group,
NULL,
};
--
2.43.0
On Thu, Dec 07, 2023 at 01:36:23PM +0100, Daniel Wagner wrote:
> +static int ns_head_update_nuse(struct nvme_ns_head *head)
> +{
> + struct nvme_id_ns *id;
> + struct nvme_ns *ns;
> + int srcu_idx, ret = -EWOULDBLOCK;
> +
> + /* Avoid issuing commands too often by rate limiting the update */
> + if (!__ratelimit(&head->rs_nuse))
> + return 0;
> +
> + pr_info("%s: head %p\n", __func__, head);
Forgot to remove this debug print.
> +static int ns_update_nuse(struct nvme_ns *ns)
> +{
> + struct nvme_id_ns *id;
> + int ret;
> +
> + /* Avoid issuing commands too often by rate limiting the update. */
> + if (!__ratelimit(&ns->head->rs_nuse))
> + return 0;
> +
> + pr_info("%s: ns %p\n", __func__, ns);
ditto
Hi Daniel,
this looks generally good to me. A few comments, all but one are
cosmetic.
> @@ -1676,9 +1678,9 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
This an now take the ns_head instead of the ns.
> @@ -1776,11 +1778,11 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
This can take a ctrl and ns_head instead of the ns.
> @@ -1835,8 +1837,8 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
This as well after updating nvme_ns_has_pi to take the ns_head.
> @@ -1898,7 +1900,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
> struct nvme_ns *ns, struct nvme_id_ns *id)
This as well after fixing up nvme_lba_to_sect to take the ns_head.
> @@ -2052,7 +2055,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
This one as well.
> @@ -128,7 +129,7 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
> sizeof(struct nvme_zone_descriptor);
>
> nr_zones = min_t(unsigned int, nr_zones,
> - get_capacity(ns->disk) >> ilog2(ns->zsze));
> + get_capacity(ns->head->disk) >> ilog2(ns->head->zsze));
This doesn't work, as the head disk doesn't exist for !multipath setups.
> @@ -162,7 +163,7 @@ static int nvme_zone_parse_entry(struct nvme_ns *ns,
This could do with ctrl + ns_head now
Looks good:
Reviewed-by: Christoph Hellwig <[email protected]>
With the extra debugging removed:
Reviewed-by: Christoph Hellwig <[email protected]>
On Thu, Dec 07, 2023 at 01:36:24PM +0100, Daniel Wagner wrote:
> ns_id, lba_shift and ms are always accessed for every read/write I/O in
> nvme_setup_rw. By grouping these variables into one cacheline we can
> safe some cycles.
>
> 4k sequential reads:
>
> baseline patched
> Bandwidth: 1620 1634
> IOPs 66345579 66910939
Nice:
Reviewed-by: Christoph Hellwig <[email protected]>
On Thu, Dec 07, 2023 at 01:36:21PM +0100, Daniel Wagner wrote:
> @@ -1906,7 +1908,8 @@ static void nvme_update_disk_info(struct gendisk *disk,
> * or smaller than a sector size yet, so catch this early and don't
> * allow block I/O.
> */
> - if (ns->lba_shift > PAGE_SHIFT || ns->lba_shift < SECTOR_SHIFT) {
> + if (ns->head->lba_shift > PAGE_SHIFT ||
> + ns->head->lba_shift < SECTOR_SHIFT) {
> capacity = 0;
> bs = (1 << 9);
> }
A minor conflict here: this series would target nvme-6.8, but the block
tree we're based on doesn't have this code. I'll patch it up for the
current 6.8 tree and make a note of the conflict for the next merge
window.
On Thu, Dec 07, 2023 at 01:36:23PM +0100, Daniel Wagner wrote:
> @@ -3418,6 +3419,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
> head->ns_id = info->nsid;
> head->ids = info->ids;
> head->shared = info->is_shared;
> + ratelimit_state_init(&head->rs_nuse, 5 * HZ, 1);
> kref_init(&head->ref);
I think we need to add:
ratelimit_set_flags(&head->rs_nuse, RATELIMIT_MSG_ON_RELEASE);
So that we don't get periodic messages like:
[ 60.469730] ns_head_update_nuse: 39 callbacks suppressed
[ 159.532901] ns_head_update_nuse: 1999 callbacks suppressed
Hi Christoph,
On Thu, Dec 07, 2023 at 04:31:38PM +0100, Christoph Hellwig wrote:
> this looks generally good to me. A few comments, all but one are
> cosmetic.
>
> > @@ -1676,9 +1678,9 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
>
> This an now take the ns_head instead of the ns.
Okay
> > @@ -1776,11 +1778,11 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
>
> This can take a ctrl and ns_head instead of the ns.
Okay
> > @@ -1835,8 +1837,8 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
>
> This as well after updating nvme_ns_has_pi to take the ns_head.
Okay
> > @@ -1898,7 +1900,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
> > struct nvme_ns *ns, struct nvme_id_ns *id)
>
> This as well after fixing up nvme_lba_to_sect to take the ns_head.
Okay
>
> > @@ -2052,7 +2055,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
>
> This one as well.
When trying to refactor this function I run into a bit of trouble with
dependencies. Basically, we would need to pass through nvme_ctrl,
request_queue, gendisk and nvme_ns_head and still need nvme_ns in
nvme_update_zone_info:
nvme_update_ns_info_block
blk_mq_freeze_queue(ns->disk->queue)
nvme_set_chuck_sector(ns->queue)
nvme_update_zone_info()
set_bit(NVME_NS_FORCE_RO, ns->flags)
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
I suggest we don't refactor this part now. At least it I don't see an
good way to avoid passing down nvme_ns as we still need this data
structure in deeper down the call chain.
> > @@ -128,7 +129,7 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
> > sizeof(struct nvme_zone_descriptor);
> >
> > nr_zones = min_t(unsigned int, nr_zones,
> > - get_capacity(ns->disk) >> ilog2(ns->zsze));
> > + get_capacity(ns->head->disk) >> ilog2(ns->head->zsze));
>
> This doesn't work, as the head disk doesn't exist for !multipath
> setups.
I was a bit trigger happy here. Changed it back to 'get_capacity(ns->disk)'.
> > @@ -162,7 +163,7 @@ static int nvme_zone_parse_entry(struct nvme_ns *ns,
>
> This could do with ctrl + ns_head now
Okay.
Thanks,
Daniel
On Thu, Dec 07, 2023 at 09:44:58AM -0700, Keith Busch wrote:
> On Thu, Dec 07, 2023 at 01:36:23PM +0100, Daniel Wagner wrote:
> > @@ -3418,6 +3419,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
> > head->ns_id = info->nsid;
> > head->ids = info->ids;
> > head->shared = info->is_shared;
> > + ratelimit_state_init(&head->rs_nuse, 5 * HZ, 1);
> > kref_init(&head->ref);
>
> I think we need to add:
>
> ratelimit_set_flags(&head->rs_nuse, RATELIMIT_MSG_ON_RELEASE);
>
> So that we don't get periodic messages like:
>
> [ 60.469730] ns_head_update_nuse: 39 callbacks suppressed
> [ 159.532901] ns_head_update_nuse: 1999 callbacks suppressed
Good idea. I'll add this.
On Thu, Dec 07, 2023 at 09:31:16AM -0700, Keith Busch wrote:
> On Thu, Dec 07, 2023 at 01:36:21PM +0100, Daniel Wagner wrote:
> > @@ -1906,7 +1908,8 @@ static void nvme_update_disk_info(struct gendisk *disk,
> > * or smaller than a sector size yet, so catch this early and don't
> > * allow block I/O.
> > */
> > - if (ns->lba_shift > PAGE_SHIFT || ns->lba_shift < SECTOR_SHIFT) {
> > + if (ns->head->lba_shift > PAGE_SHIFT ||
> > + ns->head->lba_shift < SECTOR_SHIFT) {
> > capacity = 0;
> > bs = (1 << 9);
> > }
>
> A minor conflict here: this series would target nvme-6.8, but the block
> tree we're based on doesn't have this code. I'll patch it up for the
> current 6.8 tree and make a note of the conflict for the next merge
> window.
I've missed that nvme-6.8 was available now. I can rebase v5 ontop of
nvme-6.8 if you want me to do it. Whatever is simpler for you.