I've added two new refactoring patches which convert a bunch of functions to
accept nvme_ns_head instead of nvme_ns. Though I've left out the conversion of
nvme_update_ns_info_block because there are still users of nvme_ns deeper down
the callchain.
Besides this I've addressed all comments from v4: removed debug output, silenced
ratelimit messages and reverted the nvme_ns_head conversion in
nvme_zns_alloc_report_buffer.
The benchmark numbers are still roughly the same and all blktests for all
transports pass with this change. My previous claim in v3 that something is
broken for rdma turned out to be wrong; it was my test setup that was broken.
Thanks,
Daniel
libnvme changes:
https://github.com/igaw/libnvme/tree/tree-no-cmd
changes:
v5:
- reverted trigger happy conversion to nvme_ns_head in
nvme_zns_alloc_report_buffer
- removed debug output
- added refactoring patches
- silenced ratelimit suppression messages
- added reviewed tags
v4:
- drop 'use nvme_ns_head instead nvme_ns' patches
- ratelimit nuse update per namespace not globally
- rename ns attribute group
- added non-multipath nuse update logic
- added cacheline optimization
- https://lore.kernel.org/linux-nvme/[email protected]/
v3:
- cut overlong lines shorter
- fixed disk (queuedata) initialization order
- more testing with blktest
- added nuse ratelimit
- added reviewed tags
- https://lore.kernel.org/linux-nvme/[email protected]/
v2:
- moved ns id data to nvme_ns_head
- dropped ds, nsze
- https://lore.kernel.org/linux-nvme/[email protected]/
v1:
- initial version
- https://lore.kernel.org/linux-nvme/[email protected]/
Daniel Wagner (6):
nvme: move ns id info to struct nvme_ns_head
nvme: refactor ns info helpers
nvme: refactor ns info setup function
nvme: rename ns attribute group
nvme: add csi, ms and nuse to sysfs
nvme: repack struct nvme_ns_head
drivers/nvme/host/core.c | 168 ++++++++++++++++++----------------
drivers/nvme/host/ioctl.c | 8 +-
drivers/nvme/host/multipath.c | 2 +-
drivers/nvme/host/nvme.h | 44 +++++----
drivers/nvme/host/rdma.c | 4 +-
drivers/nvme/host/sysfs.c | 99 ++++++++++++++++++--
drivers/nvme/host/zns.c | 35 +++----
7 files changed, 233 insertions(+), 127 deletions(-)
--
2.43.0
Move the namespace info to struct nvme_ns_head, because it's the same
for all associated namespaces.
Signed-off-by: Daniel Wagner <[email protected]>
---
drivers/nvme/host/core.c | 81 ++++++++++++++++++++-------------------
drivers/nvme/host/ioctl.c | 8 ++--
drivers/nvme/host/nvme.h | 28 +++++++-------
drivers/nvme/host/rdma.c | 2 +-
drivers/nvme/host/zns.c | 17 ++++----
5 files changed, 70 insertions(+), 66 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index d699f0c8b13e..72908e622049 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -312,12 +312,12 @@ static void nvme_log_error(struct request *req)
struct nvme_request *nr = nvme_req(req);
if (ns) {
- pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %llu blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
+ pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %u blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
ns->disk ? ns->disk->disk_name : "?",
nvme_get_opcode_str(nr->cmd->common.opcode),
nr->cmd->common.opcode,
- (unsigned long long)nvme_sect_to_lba(ns, blk_rq_pos(req)),
- (unsigned long long)blk_rq_bytes(req) >> ns->lba_shift,
+ nvme_sect_to_lba(ns, blk_rq_pos(req)),
+ blk_rq_bytes(req) >> ns->head->lba_shift,
nvme_get_error_status_str(nr->status),
nr->status >> 8 & 7, /* Status Code Type */
nr->status & 0xff, /* Status Code */
@@ -794,7 +794,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
if (queue_max_discard_segments(req->q) == 1) {
u64 slba = nvme_sect_to_lba(ns, blk_rq_pos(req));
- u32 nlb = blk_rq_sectors(req) >> (ns->lba_shift - 9);
+ u32 nlb = blk_rq_sectors(req) >> (ns->head->lba_shift - 9);
range[0].cattr = cpu_to_le32(0);
range[0].nlb = cpu_to_le32(nlb);
@@ -803,7 +803,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
} else {
__rq_for_each_bio(bio, req) {
u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
- u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
+ u32 nlb = bio->bi_iter.bi_size >> ns->head->lba_shift;
if (n < segments) {
range[n].cattr = cpu_to_le32(0);
@@ -841,7 +841,7 @@ static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
u64 ref48;
/* both rw and write zeroes share the same reftag format */
- switch (ns->guard_type) {
+ switch (ns->head->guard_type) {
case NVME_NVM_NS_16B_GUARD:
cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
break;
@@ -871,15 +871,16 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
cmnd->write_zeroes.slba =
cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
cmnd->write_zeroes.length =
- cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+ cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
- if (!(req->cmd_flags & REQ_NOUNMAP) && (ns->features & NVME_NS_DEAC))
+ if (!(req->cmd_flags & REQ_NOUNMAP) &&
+ (ns->head->features & NVME_NS_DEAC))
cmnd->write_zeroes.control |= cpu_to_le16(NVME_WZ_DEAC);
if (nvme_ns_has_pi(ns)) {
cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT);
- switch (ns->pi_type) {
+ switch (ns->head->pi_type) {
case NVME_NS_DPS_PI_TYPE1:
case NVME_NS_DPS_PI_TYPE2:
nvme_set_ref_tag(ns, cmnd, req);
@@ -912,12 +913,13 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
cmnd->rw.cdw3 = 0;
cmnd->rw.metadata = 0;
cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
- cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+ cmnd->rw.length =
+ cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
cmnd->rw.reftag = 0;
cmnd->rw.apptag = 0;
cmnd->rw.appmask = 0;
- if (ns->ms) {
+ if (ns->head->ms) {
/*
* If formated with metadata, the block layer always provides a
* metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else
@@ -930,7 +932,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
control |= NVME_RW_PRINFO_PRACT;
}
- switch (ns->pi_type) {
+ switch (ns->head->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
control |= NVME_RW_PRINFO_PRCHK_GUARD;
break;
@@ -1676,9 +1678,9 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
{
struct blk_integrity integrity = { };
- switch (ns->pi_type) {
+ switch (ns->head->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
- switch (ns->guard_type) {
+ switch (ns->head->guard_type) {
case NVME_NVM_NS_16B_GUARD:
integrity.profile = &t10_pi_type3_crc;
integrity.tag_size = sizeof(u16) + sizeof(u32);
@@ -1696,7 +1698,7 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
break;
case NVME_NS_DPS_PI_TYPE1:
case NVME_NS_DPS_PI_TYPE2:
- switch (ns->guard_type) {
+ switch (ns->head->guard_type) {
case NVME_NVM_NS_16B_GUARD:
integrity.profile = &t10_pi_type1_crc;
integrity.tag_size = sizeof(u16);
@@ -1717,7 +1719,7 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
break;
}
- integrity.tuple_size = ns->ms;
+ integrity.tuple_size = ns->head->ms;
blk_integrity_register(disk, &integrity);
blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
}
@@ -1776,11 +1778,11 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
int ret = 0;
u32 elbaf;
- ns->pi_size = 0;
- ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
+ ns->head->pi_size = 0;
+ ns->head->ms = le16_to_cpu(id->lbaf[lbaf].ms);
if (!(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
- ns->pi_size = sizeof(struct t10_pi_tuple);
- ns->guard_type = NVME_NVM_NS_16B_GUARD;
+ ns->head->pi_size = sizeof(struct t10_pi_tuple);
+ ns->head->guard_type = NVME_NVM_NS_16B_GUARD;
goto set_pi;
}
@@ -1803,13 +1805,13 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
if (nvme_elbaf_sts(elbaf))
goto free_data;
- ns->guard_type = nvme_elbaf_guard_type(elbaf);
- switch (ns->guard_type) {
+ ns->head->guard_type = nvme_elbaf_guard_type(elbaf);
+ switch (ns->head->guard_type) {
case NVME_NVM_NS_64B_GUARD:
- ns->pi_size = sizeof(struct crc64_pi_tuple);
+ ns->head->pi_size = sizeof(struct crc64_pi_tuple);
break;
case NVME_NVM_NS_16B_GUARD:
- ns->pi_size = sizeof(struct t10_pi_tuple);
+ ns->head->pi_size = sizeof(struct t10_pi_tuple);
break;
default:
break;
@@ -1818,10 +1820,10 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
free_data:
kfree(nvm);
set_pi:
- if (ns->pi_size && (first || ns->ms == ns->pi_size))
- ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
+ if (ns->head->pi_size && (first || ns->head->ms == ns->head->pi_size))
+ ns->head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
else
- ns->pi_type = 0;
+ ns->head->pi_type = 0;
return ret;
}
@@ -1835,8 +1837,8 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
if (ret)
return ret;
- ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
- if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
+ ns->head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
+ if (!ns->head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
return 0;
if (ctrl->ops->flags & NVME_F_FABRICS) {
@@ -1848,7 +1850,7 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
return 0;
- ns->features |= NVME_NS_EXT_LBAS;
+ ns->head->features |= NVME_NS_EXT_LBAS;
/*
* The current fabrics transport drivers support namespace
@@ -1860,7 +1862,7 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
* gain the ability to use other metadata formats.
*/
if (ctrl->max_integrity_segments && nvme_ns_has_pi(ns))
- ns->features |= NVME_NS_METADATA_SUPPORTED;
+ ns->head->features |= NVME_NS_METADATA_SUPPORTED;
} else {
/*
* For PCIe controllers, we can't easily remap the separate
@@ -1869,9 +1871,9 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
* We allow extended LBAs for the passthrough interface, though.
*/
if (id->flbas & NVME_NS_FLBAS_META_EXT)
- ns->features |= NVME_NS_EXT_LBAS;
+ ns->head->features |= NVME_NS_EXT_LBAS;
else
- ns->features |= NVME_NS_METADATA_SUPPORTED;
+ ns->head->features |= NVME_NS_METADATA_SUPPORTED;
}
return 0;
}
@@ -1898,7 +1900,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
struct nvme_ns *ns, struct nvme_id_ns *id)
{
sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
- u32 bs = 1U << ns->lba_shift;
+ u32 bs = 1U << ns->head->lba_shift;
u32 atomic_bs, phys_bs, io_opt = 0;
/*
@@ -1906,7 +1908,8 @@ static void nvme_update_disk_info(struct gendisk *disk,
* or smaller than a sector size yet, so catch this early and don't
* allow block I/O.
*/
- if (ns->lba_shift > PAGE_SHIFT || ns->lba_shift < SECTOR_SHIFT) {
+ if (ns->head->lba_shift > PAGE_SHIFT ||
+ ns->head->lba_shift < SECTOR_SHIFT) {
capacity = 0;
bs = (1 << 9);
}
@@ -1949,9 +1952,9 @@ static void nvme_update_disk_info(struct gendisk *disk,
* I/O to namespaces with metadata except when the namespace supports
* PI, as it can strip/insert in that case.
*/
- if (ns->ms) {
+ if (ns->head->ms) {
if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
- (ns->features & NVME_NS_METADATA_SUPPORTED))
+ (ns->head->features & NVME_NS_METADATA_SUPPORTED))
nvme_init_integrity(disk, ns,
ns->ctrl->max_integrity_segments);
else if (!nvme_ns_has_pi(ns))
@@ -2052,7 +2055,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
blk_mq_freeze_queue(ns->disk->queue);
lbaf = nvme_lbaf_index(id->flbas);
- ns->lba_shift = id->lbaf[lbaf].ds;
+ ns->head->lba_shift = id->lbaf[lbaf].ds;
nvme_set_queue_limits(ns->ctrl, ns->queue);
ret = nvme_configure_metadata(ns, id);
@@ -2078,7 +2081,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
* do not return zeroes.
*/
if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
- ns->features |= NVME_NS_DEAC;
+ ns->head->features |= NVME_NS_DEAC;
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
set_bit(NVME_NS_READY, &ns->flags);
blk_mq_unfreeze_queue(ns->disk->queue);
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 529b9954d2b8..feee9cf50670 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -283,10 +283,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
return -EINVAL;
}
- length = (io.nblocks + 1) << ns->lba_shift;
+ length = (io.nblocks + 1) << ns->head->lba_shift;
if ((io.control & NVME_RW_PRINFO_PRACT) &&
- ns->ms == sizeof(struct t10_pi_tuple)) {
+ ns->head->ms == sizeof(struct t10_pi_tuple)) {
/*
* Protection information is stripped/inserted by the
* controller.
@@ -296,11 +296,11 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
meta_len = 0;
metadata = NULL;
} else {
- meta_len = (io.nblocks + 1) * ns->ms;
+ meta_len = (io.nblocks + 1) * ns->head->ms;
metadata = nvme_to_user_ptr(io.metadata);
}
- if (ns->features & NVME_NS_EXT_LBAS) {
+ if (ns->head->features & NVME_NS_EXT_LBAS) {
length += meta_len;
meta_len = 0;
} else if (meta_len) {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 578e6d311bc9..1ebe6a9b42c9 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -451,6 +451,17 @@ struct nvme_ns_head {
bool shared;
int instance;
struct nvme_effects_log *effects;
+ int lba_shift;
+ u16 ms;
+ u16 pi_size;
+ u16 sgs;
+ u32 sws;
+ u8 pi_type;
+ u8 guard_type;
+#ifdef CONFIG_BLK_DEV_ZONED
+ u64 zsze;
+#endif
+ unsigned long features;
struct cdev cdev;
struct device cdev_device;
@@ -492,17 +503,6 @@ struct nvme_ns {
struct kref kref;
struct nvme_ns_head *head;
- int lba_shift;
- u16 ms;
- u16 pi_size;
- u16 sgs;
- u32 sws;
- u8 pi_type;
- u8 guard_type;
-#ifdef CONFIG_BLK_DEV_ZONED
- u64 zsze;
-#endif
- unsigned long features;
unsigned long flags;
#define NVME_NS_REMOVING 0
#define NVME_NS_ANA_PENDING 2
@@ -519,7 +519,7 @@ struct nvme_ns {
/* NVMe ns supports metadata actions by the controller (generate/strip) */
static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
{
- return ns->pi_type && ns->ms == ns->pi_size;
+ return ns->head->pi_type && ns->head->ms == ns->head->pi_size;
}
struct nvme_ctrl_ops {
@@ -653,7 +653,7 @@ static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl)
*/
static inline u64 nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector)
{
- return sector >> (ns->lba_shift - SECTOR_SHIFT);
+ return sector >> (ns->head->lba_shift - SECTOR_SHIFT);
}
/*
@@ -661,7 +661,7 @@ static inline u64 nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector)
*/
static inline sector_t nvme_lba_to_sect(struct nvme_ns *ns, u64 lba)
{
- return lba << (ns->lba_shift - SECTOR_SHIFT);
+ return lba << (ns->head->lba_shift - SECTOR_SHIFT);
}
/*
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 81e2621169e5..fc0df91e6b36 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1423,7 +1423,7 @@ static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue,
goto mr_put;
nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_bdev->bd_disk), c,
- req->mr->sig_attrs, ns->pi_type);
+ req->mr->sig_attrs, ns->head->pi_type);
nvme_rdma_set_prot_checks(c, &req->mr->sig_attrs->check_mask);
ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index ec8557810c21..fa9e8f664ae7 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -11,7 +11,7 @@ int nvme_revalidate_zones(struct nvme_ns *ns)
{
struct request_queue *q = ns->queue;
- blk_queue_chunk_sectors(q, ns->zsze);
+ blk_queue_chunk_sectors(q, ns->head->zsze);
blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append);
return blk_revalidate_disk_zones(ns->disk, NULL);
@@ -99,11 +99,12 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
goto free_data;
}
- ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
- if (!is_power_of_2(ns->zsze)) {
+ ns->head->zsze =
+ nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
+ if (!is_power_of_2(ns->head->zsze)) {
dev_warn(ns->ctrl->device,
"invalid zone size:%llu for namespace:%u\n",
- ns->zsze, ns->head->ns_id);
+ ns->head->zsze, ns->head->ns_id);
status = -ENODEV;
goto free_data;
}
@@ -128,7 +129,7 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
sizeof(struct nvme_zone_descriptor);
nr_zones = min_t(unsigned int, nr_zones,
- get_capacity(ns->disk) >> ilog2(ns->zsze));
+ get_capacity(ns->disk) >> ilog2(ns->head->zsze));
bufsize = sizeof(struct nvme_zone_report) +
nr_zones * sizeof(struct nvme_zone_descriptor);
@@ -162,7 +163,7 @@ static int nvme_zone_parse_entry(struct nvme_ns *ns,
zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
zone.cond = entry->zs >> 4;
- zone.len = ns->zsze;
+ zone.len = ns->head->zsze;
zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
if (zone.cond == BLK_ZONE_COND_FULL)
@@ -196,7 +197,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
- sector &= ~(ns->zsze - 1);
+ sector &= ~(ns->head->zsze - 1);
while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
memset(report, 0, buflen);
@@ -220,7 +221,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
zone_idx++;
}
- sector += ns->zsze * nz;
+ sector += ns->head->zsze * nz;
}
if (zone_idx > 0)
--
2.43.0
Pass in the nvme_ns_head pointer directly. This reduces the need for the
caller to have the nvme_ns data structure present. Thus we can refactor
the caller side in the next step as well.
Signed-off-by: Daniel Wagner <[email protected]>
---
drivers/nvme/host/core.c | 36 +++++++++++++++++++++---------------
drivers/nvme/host/nvme.h | 12 ++++++------
drivers/nvme/host/rdma.c | 2 +-
drivers/nvme/host/zns.c | 12 ++++++------
4 files changed, 34 insertions(+), 28 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 72908e622049..f0fe41afc958 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -316,7 +316,7 @@ static void nvme_log_error(struct request *req)
ns->disk ? ns->disk->disk_name : "?",
nvme_get_opcode_str(nr->cmd->common.opcode),
nr->cmd->common.opcode,
- nvme_sect_to_lba(ns, blk_rq_pos(req)),
+ nvme_sect_to_lba(ns->head, blk_rq_pos(req)),
blk_rq_bytes(req) >> ns->head->lba_shift,
nvme_get_error_status_str(nr->status),
nr->status >> 8 & 7, /* Status Code Type */
@@ -372,9 +372,12 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
static inline void nvme_end_req_zoned(struct request *req)
{
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
- req_op(req) == REQ_OP_ZONE_APPEND)
- req->__sector = nvme_lba_to_sect(req->q->queuedata,
+ req_op(req) == REQ_OP_ZONE_APPEND) {
+ struct nvme_ns *ns = req->q->queuedata;
+
+ req->__sector = nvme_lba_to_sect(ns->head,
le64_to_cpu(nvme_req(req)->result.u64));
+ }
}
static inline void nvme_end_req(struct request *req)
@@ -793,7 +796,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
}
if (queue_max_discard_segments(req->q) == 1) {
- u64 slba = nvme_sect_to_lba(ns, blk_rq_pos(req));
+ u64 slba = nvme_sect_to_lba(ns->head, blk_rq_pos(req));
u32 nlb = blk_rq_sectors(req) >> (ns->head->lba_shift - 9);
range[0].cattr = cpu_to_le32(0);
@@ -802,7 +805,8 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
n = 1;
} else {
__rq_for_each_bio(bio, req) {
- u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
+ u64 slba = nvme_sect_to_lba(ns->head,
+ bio->bi_iter.bi_sector);
u32 nlb = bio->bi_iter.bi_size >> ns->head->lba_shift;
if (n < segments) {
@@ -869,7 +873,7 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
cmnd->write_zeroes.slba =
- cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
+ cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
cmnd->write_zeroes.length =
cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
@@ -877,7 +881,7 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
(ns->head->features & NVME_NS_DEAC))
cmnd->write_zeroes.control |= cpu_to_le16(NVME_WZ_DEAC);
- if (nvme_ns_has_pi(ns)) {
+ if (nvme_ns_has_pi(ns->head)) {
cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT);
switch (ns->head->pi_type) {
@@ -912,7 +916,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
cmnd->rw.cdw2 = 0;
cmnd->rw.cdw3 = 0;
cmnd->rw.metadata = 0;
- cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
+ cmnd->rw.slba =
+ cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
cmnd->rw.length =
cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
cmnd->rw.reftag = 0;
@@ -927,7 +932,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
* namespace capacity to zero to prevent any I/O.
*/
if (!blk_integrity_rq(req)) {
- if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
+ if (WARN_ON_ONCE(!nvme_ns_has_pi(ns->head)))
return BLK_STS_NOTSUPP;
control |= NVME_RW_PRINFO_PRACT;
}
@@ -1736,8 +1741,9 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
struct request_queue *queue = disk->queue;
u32 size = queue_logical_block_size(queue);
- if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns, UINT_MAX))
- ctrl->max_discard_sectors = nvme_lba_to_sect(ns, ctrl->dmrsl);
+ if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX))
+ ctrl->max_discard_sectors =
+ nvme_lba_to_sect(ns->head, ctrl->dmrsl);
if (ctrl->max_discard_sectors == 0) {
blk_queue_max_discard_sectors(queue, 0);
@@ -1861,7 +1867,7 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
* Note, this check will need to be modified if any drivers
* gain the ability to use other metadata formats.
*/
- if (ctrl->max_integrity_segments && nvme_ns_has_pi(ns))
+ if (ctrl->max_integrity_segments && nvme_ns_has_pi(ns->head))
ns->head->features |= NVME_NS_METADATA_SUPPORTED;
} else {
/*
@@ -1899,7 +1905,7 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
static void nvme_update_disk_info(struct gendisk *disk,
struct nvme_ns *ns, struct nvme_id_ns *id)
{
- sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
+ sector_t capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze));
u32 bs = 1U << ns->head->lba_shift;
u32 atomic_bs, phys_bs, io_opt = 0;
@@ -1957,7 +1963,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
(ns->head->features & NVME_NS_METADATA_SUPPORTED))
nvme_init_integrity(disk, ns,
ns->ctrl->max_integrity_segments);
- else if (!nvme_ns_has_pi(ns))
+ else if (!nvme_ns_has_pi(ns->head))
capacity = 0;
}
@@ -1988,7 +1994,7 @@ static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
is_power_of_2(ctrl->max_hw_sectors))
iob = ctrl->max_hw_sectors;
else
- iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
+ iob = nvme_lba_to_sect(ns->head, le16_to_cpu(id->noiob));
if (!iob)
return;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 1ebe6a9b42c9..eb748cc3e897 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -517,9 +517,9 @@ struct nvme_ns {
};
/* NVMe ns supports metadata actions by the controller (generate/strip) */
-static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
+static inline bool nvme_ns_has_pi(struct nvme_ns_head *head)
{
- return ns->head->pi_type && ns->head->ms == ns->head->pi_size;
+ return head->pi_type && head->ms == head->pi_size;
}
struct nvme_ctrl_ops {
@@ -651,17 +651,17 @@ static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl)
/*
* Convert a 512B sector number to a device logical block number.
*/
-static inline u64 nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector)
+static inline u64 nvme_sect_to_lba(struct nvme_ns_head *head, sector_t sector)
{
- return sector >> (ns->head->lba_shift - SECTOR_SHIFT);
+ return sector >> (head->lba_shift - SECTOR_SHIFT);
}
/*
* Convert a device logical block number to a 512B sector number.
*/
-static inline sector_t nvme_lba_to_sect(struct nvme_ns *ns, u64 lba)
+static inline sector_t nvme_lba_to_sect(struct nvme_ns_head *head, u64 lba)
{
- return lba << (ns->head->lba_shift - SECTOR_SHIFT);
+ return lba << (head->lba_shift - SECTOR_SHIFT);
}
/*
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index fc0df91e6b36..c89503da24d7 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -2017,7 +2017,7 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
queue->pi_support &&
(c->common.opcode == nvme_cmd_write ||
c->common.opcode == nvme_cmd_read) &&
- nvme_ns_has_pi(ns))
+ nvme_ns_has_pi(ns->head))
req->use_sig_mr = true;
else
req->use_sig_mr = false;
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index fa9e8f664ae7..ded52ab05424 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -100,7 +100,7 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
}
ns->head->zsze =
- nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
+ nvme_lba_to_sect(ns->head, le64_to_cpu(id->lbafe[lbaf].zsze));
if (!is_power_of_2(ns->head->zsze)) {
dev_warn(ns->ctrl->device,
"invalid zone size:%llu for namespace:%u\n",
@@ -164,12 +164,12 @@ static int nvme_zone_parse_entry(struct nvme_ns *ns,
zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
zone.cond = entry->zs >> 4;
zone.len = ns->head->zsze;
- zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
- zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
+ zone.capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(entry->zcap));
+ zone.start = nvme_lba_to_sect(ns->head, le64_to_cpu(entry->zslba));
if (zone.cond == BLK_ZONE_COND_FULL)
zone.wp = zone.start + zone.len;
else
- zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp));
+ zone.wp = nvme_lba_to_sect(ns->head, le64_to_cpu(entry->wp));
return cb(&zone, idx, data);
}
@@ -201,7 +201,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
memset(report, 0, buflen);
- c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector));
+ c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns->head, sector));
ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen);
if (ret) {
if (ret > 0)
@@ -240,7 +240,7 @@ blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
c->zms.opcode = nvme_cmd_zone_mgmt_send;
c->zms.nsid = cpu_to_le32(ns->head->ns_id);
- c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
+ c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
c->zms.zsa = action;
if (req_op(req) == REQ_OP_ZONE_RESET_ALL)
--
2.43.0
Drop the 'id' part of the attribute group name because we want to expose
non 'id' related attributes via the ns attribute group.
Reviewed-by: Christoph Hellwig <[email protected]>
Signed-off-by: Daniel Wagner <[email protected]>
---
drivers/nvme/host/core.c | 2 +-
drivers/nvme/host/multipath.c | 2 +-
drivers/nvme/host/nvme.h | 2 +-
drivers/nvme/host/sysfs.c | 14 +++++++-------
4 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2468076d64c6..c537914b75e3 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3674,7 +3674,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
up_write(&ctrl->namespaces_rwsem);
nvme_get_ctrl(ctrl);
- if (device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups))
+ if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups))
goto out_cleanup_ns_from_list;
if (!nvme_ns_head_multipath(ns->head))
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 0a88d7bdc5e3..2dd4137a08b2 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -579,7 +579,7 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
*/
if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
rc = device_add_disk(&head->subsys->dev, head->disk,
- nvme_ns_id_attr_groups);
+ nvme_ns_attr_groups);
if (rc) {
clear_bit(NVME_NSHEAD_DISK_LIVE, &ns->flags);
return;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index eb748cc3e897..b783b37328fa 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -870,7 +870,7 @@ int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
-extern const struct attribute_group *nvme_ns_id_attr_groups[];
+extern const struct attribute_group *nvme_ns_attr_groups[];
extern const struct pr_ops nvme_pr_ops;
extern const struct block_device_operations nvme_ns_head_ops;
extern const struct attribute_group nvme_dev_attrs_group;
diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index c6b7fbd4d34d..d682d0a667a0 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -114,7 +114,7 @@ static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
}
static DEVICE_ATTR_RO(nsid);
-static struct attribute *nvme_ns_id_attrs[] = {
+static struct attribute *nvme_ns_attrs[] = {
&dev_attr_wwid.attr,
&dev_attr_uuid.attr,
&dev_attr_nguid.attr,
@@ -127,7 +127,7 @@ static struct attribute *nvme_ns_id_attrs[] = {
NULL,
};
-static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
+static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
struct attribute *a, int n)
{
struct device *dev = container_of(kobj, struct device, kobj);
@@ -157,13 +157,13 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
return a->mode;
}
-static const struct attribute_group nvme_ns_id_attr_group = {
- .attrs = nvme_ns_id_attrs,
- .is_visible = nvme_ns_id_attrs_are_visible,
+static const struct attribute_group nvme_ns_attr_group = {
+ .attrs = nvme_ns_attrs,
+ .is_visible = nvme_ns_attrs_are_visible,
};
-const struct attribute_group *nvme_ns_id_attr_groups[] = {
- &nvme_ns_id_attr_group,
+const struct attribute_group *nvme_ns_attr_groups[] = {
+ &nvme_ns_attr_group,
NULL,
};
--
2.43.0
libnvme uses sysfs for enumerating the nvme resources. However, a few
attributes are missing from sysfs, and for these libnvme has to issue
commands during discovery.
As the kernel already knows all these attributes and we would like to
avoid having libnvme issue commands all the time, expose these missing
attributes.
The nuse value is updated on request because the nuse is a volatile
value. Since any user can read the sysfs attribute, a very simple rate
limit is added (update once every 5 seconds). A more sophisticated
update strategy can be added later if there is actually a need for it.
Reviewed-by: Christoph Hellwig <[email protected]>
Signed-off-by: Daniel Wagner <[email protected]>
---
drivers/nvme/host/core.c | 6 ++-
drivers/nvme/host/nvme.h | 6 +++
drivers/nvme/host/sysfs.c | 85 +++++++++++++++++++++++++++++++++++++++
3 files changed, 96 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c537914b75e3..3138cbb3b380 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -20,6 +20,7 @@
#include <linux/ptrace.h>
#include <linux/nvme_ioctl.h>
#include <linux/pm_qos.h>
+#include <linux/ratelimit.h>
#include <asm/unaligned.h>
#include "nvme.h"
@@ -1459,7 +1460,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl,
return status;
}
-static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
+int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
struct nvme_id_ns **id)
{
struct nvme_command c = { };
@@ -2061,6 +2062,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
blk_mq_freeze_queue(ns->disk->queue);
lbaf = nvme_lbaf_index(id->flbas);
ns->head->lba_shift = id->lbaf[lbaf].ds;
+ ns->head->nuse = le64_to_cpu(id->nuse);
nvme_set_queue_limits(ns->ctrl, ns->queue);
ret = nvme_configure_metadata(ns->ctrl, ns->head, id);
@@ -3423,6 +3425,8 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
head->ns_id = info->nsid;
head->ids = info->ids;
head->shared = info->is_shared;
+ ratelimit_state_init(&head->rs_nuse, 5 * HZ, 1);
+ ratelimit_set_flags(&head->rs_nuse, RATELIMIT_MSG_ON_RELEASE);
kref_init(&head->ref);
if (head->ids.csi) {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index b783b37328fa..8912292720d4 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -16,6 +16,7 @@
#include <linux/rcupdate.h>
#include <linux/wait.h>
#include <linux/t10-pi.h>
+#include <linux/ratelimit_types.h>
#include <trace/events/block.h>
@@ -456,6 +457,7 @@ struct nvme_ns_head {
u16 pi_size;
u16 sgs;
u32 sws;
+ u64 nuse;
u8 pi_type;
u8 guard_type;
#ifdef CONFIG_BLK_DEV_ZONED
@@ -463,6 +465,8 @@ struct nvme_ns_head {
#endif
unsigned long features;
+ struct ratelimit_state rs_nuse;
+
struct cdev cdev;
struct device cdev_device;
@@ -867,6 +871,8 @@ int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd,
unsigned int issue_flags);
int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
unsigned int issue_flags);
+int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
+ struct nvme_id_ns **id);
int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index d682d0a667a0..3cfae0c3af76 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -114,12 +114,97 @@ static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
}
static DEVICE_ATTR_RO(nsid);
+static ssize_t csi_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ids.csi);
+}
+static DEVICE_ATTR_RO(csi);
+
+static ssize_t metadata_bytes_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ms);
+}
+static DEVICE_ATTR_RO(metadata_bytes);
+
+static int ns_head_update_nuse(struct nvme_ns_head *head)
+{
+ struct nvme_id_ns *id;
+ struct nvme_ns *ns;
+ int srcu_idx, ret = -EWOULDBLOCK;
+
+ /* Avoid issuing commands too often by rate limiting the update */
+ if (!__ratelimit(&head->rs_nuse))
+ return 0;
+
+ srcu_idx = srcu_read_lock(&head->srcu);
+ ns = nvme_find_path(head);
+ if (!ns)
+ goto out_unlock;
+
+ ret = nvme_identify_ns(ns->ctrl, head->ns_id, &id);
+ if (ret)
+ goto out_unlock;
+
+ head->nuse = le64_to_cpu(id->nuse);
+ kfree(id);
+
+out_unlock:
+ srcu_read_unlock(&head->srcu, srcu_idx);
+ return ret;
+}
+
+static int ns_update_nuse(struct nvme_ns *ns)
+{
+ struct nvme_id_ns *id;
+ int ret;
+
+ /* Avoid issuing commands too often by rate limiting the update. */
+ if (!__ratelimit(&ns->head->rs_nuse))
+ return 0;
+
+ ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, &id);
+ if (ret)
+ goto out_free_id;
+
+ ns->head->nuse = le64_to_cpu(id->nuse);
+
+out_free_id:
+ kfree(id);
+
+ return ret;
+}
+
+static ssize_t nuse_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct nvme_ns_head *head = dev_to_ns_head(dev);
+ struct gendisk *disk = dev_to_disk(dev);
+ struct block_device *bdev = disk->part0;
+ int ret;
+
+ if (IS_ENABLED(CONFIG_NVME_MULTIPATH) &&
+ bdev->bd_disk->fops == &nvme_ns_head_ops)
+ ret = ns_head_update_nuse(head);
+ else
+ ret = ns_update_nuse(bdev->bd_disk->private_data);
+ if (ret)
+ return ret;
+
+ return sysfs_emit(buf, "%llu\n", head->nuse);
+}
+static DEVICE_ATTR_RO(nuse);
+
static struct attribute *nvme_ns_attrs[] = {
&dev_attr_wwid.attr,
&dev_attr_uuid.attr,
&dev_attr_nguid.attr,
&dev_attr_eui.attr,
+ &dev_attr_csi.attr,
&dev_attr_nsid.attr,
+ &dev_attr_metadata_bytes.attr,
+ &dev_attr_nuse.attr,
#ifdef CONFIG_NVME_MULTIPATH
&dev_attr_ana_grpid.attr,
&dev_attr_ana_state.attr,
--
2.43.0
ns_id, lba_shift and ms are accessed for every read/write I/O in
nvme_setup_rw. By grouping these variables into one cacheline we can
save some cycles.
4k sequential reads:
baseline patched
Bandwidth: 1620 1634
IOPs 66345579 66910939
Reviewed-by: Christoph Hellwig <[email protected]>
Signed-off-by: Daniel Wagner <[email protected]>
---
drivers/nvme/host/nvme.h | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 8912292720d4..416794727364 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -445,21 +445,21 @@ struct nvme_ns_head {
struct list_head list;
struct srcu_struct srcu;
struct nvme_subsystem *subsys;
- unsigned ns_id;
struct nvme_ns_ids ids;
struct list_head entry;
struct kref ref;
bool shared;
int instance;
struct nvme_effects_log *effects;
+ u64 nuse;
+ unsigned ns_id;
int lba_shift;
u16 ms;
u16 pi_size;
- u16 sgs;
- u32 sws;
- u64 nuse;
u8 pi_type;
u8 guard_type;
+ u16 sgs;
+ u32 sws;
#ifdef CONFIG_BLK_DEV_ZONED
u64 zsze;
#endif
--
2.43.0
Use nvme_ns_head instead of nvme_ns where possible. This reduces the
coupling between the different data structures.
Signed-off-by: Daniel Wagner <[email protected]>
---
drivers/nvme/host/core.c | 107 +++++++++++++++++++--------------------
drivers/nvme/host/zns.c | 16 +++---
2 files changed, 62 insertions(+), 61 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f0fe41afc958..2468076d64c6 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1678,14 +1678,14 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
}
#ifdef CONFIG_BLK_DEV_INTEGRITY
-static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
- u32 max_integrity_segments)
+static void nvme_init_integrity(struct gendisk *disk,
+ struct nvme_ns_head *head, u32 max_integrity_segments)
{
struct blk_integrity integrity = { };
- switch (ns->head->pi_type) {
+ switch (head->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
- switch (ns->head->guard_type) {
+ switch (head->guard_type) {
case NVME_NVM_NS_16B_GUARD:
integrity.profile = &t10_pi_type3_crc;
integrity.tag_size = sizeof(u16) + sizeof(u32);
@@ -1703,7 +1703,7 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
break;
case NVME_NS_DPS_PI_TYPE1:
case NVME_NS_DPS_PI_TYPE2:
- switch (ns->head->guard_type) {
+ switch (head->guard_type) {
case NVME_NVM_NS_16B_GUARD:
integrity.profile = &t10_pi_type1_crc;
integrity.tag_size = sizeof(u16);
@@ -1724,26 +1724,26 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
break;
}
- integrity.tuple_size = ns->head->ms;
+ integrity.tuple_size = head->ms;
blk_integrity_register(disk, &integrity);
blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
}
#else
-static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
- u32 max_integrity_segments)
+static void nvme_init_integrity(struct gendisk *disk,
+ struct nvme_ns_head *head, u32 max_integrity_segments)
{
}
#endif /* CONFIG_BLK_DEV_INTEGRITY */
-static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
+static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk,
+ struct nvme_ns_head *head)
{
- struct nvme_ctrl *ctrl = ns->ctrl;
struct request_queue *queue = disk->queue;
u32 size = queue_logical_block_size(queue);
- if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX))
+ if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX))
ctrl->max_discard_sectors =
- nvme_lba_to_sect(ns->head, ctrl->dmrsl);
+ nvme_lba_to_sect(head, ctrl->dmrsl);
if (ctrl->max_discard_sectors == 0) {
blk_queue_max_discard_sectors(queue, 0);
@@ -1774,21 +1774,21 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
a->csi == b->csi;
}
-static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
+static int nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head,
+ struct nvme_id_ns *id)
{
bool first = id->dps & NVME_NS_DPS_PI_FIRST;
unsigned lbaf = nvme_lbaf_index(id->flbas);
- struct nvme_ctrl *ctrl = ns->ctrl;
struct nvme_command c = { };
struct nvme_id_ns_nvm *nvm;
int ret = 0;
u32 elbaf;
- ns->head->pi_size = 0;
- ns->head->ms = le16_to_cpu(id->lbaf[lbaf].ms);
+ head->pi_size = 0;
+ head->ms = le16_to_cpu(id->lbaf[lbaf].ms);
if (!(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
- ns->head->pi_size = sizeof(struct t10_pi_tuple);
- ns->head->guard_type = NVME_NVM_NS_16B_GUARD;
+ head->pi_size = sizeof(struct t10_pi_tuple);
+ head->guard_type = NVME_NVM_NS_16B_GUARD;
goto set_pi;
}
@@ -1797,11 +1797,11 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
return -ENOMEM;
c.identify.opcode = nvme_admin_identify;
- c.identify.nsid = cpu_to_le32(ns->head->ns_id);
+ c.identify.nsid = cpu_to_le32(head->ns_id);
c.identify.cns = NVME_ID_CNS_CS_NS;
c.identify.csi = NVME_CSI_NVM;
- ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, nvm, sizeof(*nvm));
+ ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, nvm, sizeof(*nvm));
if (ret)
goto free_data;
@@ -1811,13 +1811,13 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
if (nvme_elbaf_sts(elbaf))
goto free_data;
- ns->head->guard_type = nvme_elbaf_guard_type(elbaf);
- switch (ns->head->guard_type) {
+ head->guard_type = nvme_elbaf_guard_type(elbaf);
+ switch (head->guard_type) {
case NVME_NVM_NS_64B_GUARD:
- ns->head->pi_size = sizeof(struct crc64_pi_tuple);
+ head->pi_size = sizeof(struct crc64_pi_tuple);
break;
case NVME_NVM_NS_16B_GUARD:
- ns->head->pi_size = sizeof(struct t10_pi_tuple);
+ head->pi_size = sizeof(struct t10_pi_tuple);
break;
default:
break;
@@ -1826,25 +1826,25 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
free_data:
kfree(nvm);
set_pi:
- if (ns->head->pi_size && (first || ns->head->ms == ns->head->pi_size))
- ns->head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
+ if (head->pi_size && (first || head->ms == head->pi_size))
+ head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
else
- ns->head->pi_type = 0;
+ head->pi_type = 0;
return ret;
}
-static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
+static int nvme_configure_metadata(struct nvme_ctrl *ctrl,
+ struct nvme_ns_head *head, struct nvme_id_ns *id)
{
- struct nvme_ctrl *ctrl = ns->ctrl;
int ret;
- ret = nvme_init_ms(ns, id);
+ ret = nvme_init_ms(ctrl, head, id);
if (ret)
return ret;
- ns->head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
- if (!ns->head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
+ head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
+ if (!head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
return 0;
if (ctrl->ops->flags & NVME_F_FABRICS) {
@@ -1856,7 +1856,7 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
return 0;
- ns->head->features |= NVME_NS_EXT_LBAS;
+ head->features |= NVME_NS_EXT_LBAS;
/*
* The current fabrics transport drivers support namespace
@@ -1867,8 +1867,8 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
* Note, this check will need to be modified if any drivers
* gain the ability to use other metadata formats.
*/
- if (ctrl->max_integrity_segments && nvme_ns_has_pi(ns->head))
- ns->head->features |= NVME_NS_METADATA_SUPPORTED;
+ if (ctrl->max_integrity_segments && nvme_ns_has_pi(head))
+ head->features |= NVME_NS_METADATA_SUPPORTED;
} else {
/*
* For PCIe controllers, we can't easily remap the separate
@@ -1877,9 +1877,9 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
* We allow extended LBAs for the passthrough interface, though.
*/
if (id->flbas & NVME_NS_FLBAS_META_EXT)
- ns->head->features |= NVME_NS_EXT_LBAS;
+ head->features |= NVME_NS_EXT_LBAS;
else
- ns->head->features |= NVME_NS_METADATA_SUPPORTED;
+ head->features |= NVME_NS_METADATA_SUPPORTED;
}
return 0;
}
@@ -1902,11 +1902,11 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
blk_queue_write_cache(q, vwc, vwc);
}
-static void nvme_update_disk_info(struct gendisk *disk,
- struct nvme_ns *ns, struct nvme_id_ns *id)
+static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk,
+ struct nvme_ns_head *head, struct nvme_id_ns *id)
{
- sector_t capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze));
- u32 bs = 1U << ns->head->lba_shift;
+ sector_t capacity = nvme_lba_to_sect(head, le64_to_cpu(id->nsze));
+ u32 bs = 1U << head->lba_shift;
u32 atomic_bs, phys_bs, io_opt = 0;
/*
@@ -1914,8 +1914,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
* or smaller than a sector size yet, so catch this early and don't
* allow block I/O.
*/
- if (ns->head->lba_shift > PAGE_SHIFT ||
- ns->head->lba_shift < SECTOR_SHIFT) {
+ if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) {
capacity = 0;
bs = (1 << 9);
}
@@ -1932,7 +1931,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
else
- atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
+ atomic_bs = (1 + ctrl->subsys->awupf) * bs;
}
if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
@@ -1958,20 +1957,20 @@ static void nvme_update_disk_info(struct gendisk *disk,
* I/O to namespaces with metadata except when the namespace supports
* PI, as it can strip/insert in that case.
*/
- if (ns->head->ms) {
+ if (head->ms) {
if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
- (ns->head->features & NVME_NS_METADATA_SUPPORTED))
- nvme_init_integrity(disk, ns,
- ns->ctrl->max_integrity_segments);
- else if (!nvme_ns_has_pi(ns->head))
+ (head->features & NVME_NS_METADATA_SUPPORTED))
+ nvme_init_integrity(disk, head,
+ ctrl->max_integrity_segments);
+ else if (!nvme_ns_has_pi(head))
capacity = 0;
}
set_capacity_and_notify(disk, capacity);
- nvme_config_discard(disk, ns);
+ nvme_config_discard(ctrl, disk, head);
blk_queue_max_write_zeroes_sectors(disk->queue,
- ns->ctrl->max_zeroes_sectors);
+ ctrl->max_zeroes_sectors);
}
static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info)
@@ -2064,13 +2063,13 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
ns->head->lba_shift = id->lbaf[lbaf].ds;
nvme_set_queue_limits(ns->ctrl, ns->queue);
- ret = nvme_configure_metadata(ns, id);
+ ret = nvme_configure_metadata(ns->ctrl, ns->head, id);
if (ret < 0) {
blk_mq_unfreeze_queue(ns->disk->queue);
goto out;
}
nvme_set_chunk_sectors(ns, id);
- nvme_update_disk_info(ns->disk, ns, id);
+ nvme_update_disk_info(ns->ctrl, ns->disk, ns->head, id);
if (ns->head->ids.csi == NVME_CSI_ZNS) {
ret = nvme_update_zone_info(ns, lbaf);
@@ -2100,7 +2099,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
if (nvme_ns_head_multipath(ns->head)) {
blk_mq_freeze_queue(ns->head->disk->queue);
- nvme_update_disk_info(ns->head->disk, ns, id);
+ nvme_update_disk_info(ns->ctrl, ns->head->disk, ns->head, id);
set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
nvme_mpath_revalidate_paths(ns);
blk_stack_limits(&ns->head->disk->queue->limits,
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index ded52ab05424..56b27aabcad9 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -148,7 +148,8 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
return NULL;
}
-static int nvme_zone_parse_entry(struct nvme_ns *ns,
+static int nvme_zone_parse_entry(struct nvme_ctrl *ctrl,
+ struct nvme_ns_head *head,
struct nvme_zone_descriptor *entry,
unsigned int idx, report_zones_cb cb,
void *data)
@@ -156,20 +157,20 @@ static int nvme_zone_parse_entry(struct nvme_ns *ns,
struct blk_zone zone = { };
if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
- dev_err(ns->ctrl->device, "invalid zone type %#x\n",
+ dev_err(ctrl->device, "invalid zone type %#x\n",
entry->zt);
return -EINVAL;
}
zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
zone.cond = entry->zs >> 4;
- zone.len = ns->head->zsze;
- zone.capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(entry->zcap));
- zone.start = nvme_lba_to_sect(ns->head, le64_to_cpu(entry->zslba));
+ zone.len = head->zsze;
+ zone.capacity = nvme_lba_to_sect(head, le64_to_cpu(entry->zcap));
+ zone.start = nvme_lba_to_sect(head, le64_to_cpu(entry->zslba));
if (zone.cond == BLK_ZONE_COND_FULL)
zone.wp = zone.start + zone.len;
else
- zone.wp = nvme_lba_to_sect(ns->head, le64_to_cpu(entry->wp));
+ zone.wp = nvme_lba_to_sect(head, le64_to_cpu(entry->wp));
return cb(&zone, idx, data);
}
@@ -214,7 +215,8 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
break;
for (i = 0; i < nz && zone_idx < nr_zones; i++) {
- ret = nvme_zone_parse_entry(ns, &report->entries[i],
+ ret = nvme_zone_parse_entry(ns->ctrl, ns->head,
+ &report->entries[i],
zone_idx, cb, data);
if (ret)
goto out_free;
--
2.43.0
Looks good:
Reviewed-by: Christoph Hellwig <[email protected]>
Looks good:
Reviewed-by: Christoph Hellwig <[email protected]>
Looks good:
Reviewed-by: Christoph Hellwig <[email protected]>
> diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
> index d682d0a667a0..3cfae0c3af76 100644
> --- a/drivers/nvme/host/sysfs.c
> +++ b/drivers/nvme/host/sysfs.c
> @@ -114,12 +114,97 @@ static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
> }
> static DEVICE_ATTR_RO(nsid);
>
> +static ssize_t csi_show(struct device *dev, struct device_attribute *attr,
> + char *buf)
> +{
> + return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ids.csi);
> +}
> +static DEVICE_ATTR_RO(csi);
> +
> +static ssize_t metadata_bytes_show(struct device *dev,
> + struct device_attribute *attr, char *buf)
> +{
> + return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ms);
> +}
> +static DEVICE_ATTR_RO(metadata_bytes);
> +
>
Do we need to change the format specifier, since csi (u8) and ms (u16)
are not declared as signed integers?
-ck
On 12/8/23 12:53, Daniel Wagner wrote:
> Move the namespace info to struct nvme_ns_head, because it's the same
> for all associated namespaces.
>
> Signed-off-by: Daniel Wagner <[email protected]>
> ---
> drivers/nvme/host/core.c | 81 ++++++++++++++++++++-------------------
> drivers/nvme/host/ioctl.c | 8 ++--
> drivers/nvme/host/nvme.h | 28 +++++++-------
> drivers/nvme/host/rdma.c | 2 +-
> drivers/nvme/host/zns.c | 17 ++++----
> 5 files changed, 70 insertions(+), 66 deletions(-)
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index d699f0c8b13e..72908e622049 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -312,12 +312,12 @@ static void nvme_log_error(struct request *req)
> struct nvme_request *nr = nvme_req(req);
>
> if (ns) {
> - pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %llu blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
> + pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %u blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
> ns->disk ? ns->disk->disk_name : "?",
> nvme_get_opcode_str(nr->cmd->common.opcode),
> nr->cmd->common.opcode,
> - (unsigned long long)nvme_sect_to_lba(ns, blk_rq_pos(req)),
> - (unsigned long long)blk_rq_bytes(req) >> ns->lba_shift,
> + nvme_sect_to_lba(ns, blk_rq_pos(req)),
> + blk_rq_bytes(req) >> ns->head->lba_shift,
> nvme_get_error_status_str(nr->status),
> nr->status >> 8 & 7, /* Status Code Type */
> nr->status & 0xff, /* Status Code */
> @@ -794,7 +794,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
>
> if (queue_max_discard_segments(req->q) == 1) {
> u64 slba = nvme_sect_to_lba(ns, blk_rq_pos(req));
> - u32 nlb = blk_rq_sectors(req) >> (ns->lba_shift - 9);
> + u32 nlb = blk_rq_sectors(req) >> (ns->head->lba_shift - 9);
>
> range[0].cattr = cpu_to_le32(0);
> range[0].nlb = cpu_to_le32(nlb);
> @@ -803,7 +803,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
> } else {
> __rq_for_each_bio(bio, req) {
> u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
> - u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
> + u32 nlb = bio->bi_iter.bi_size >> ns->head->lba_shift;
>
> if (n < segments) {
> range[n].cattr = cpu_to_le32(0);
> @@ -841,7 +841,7 @@ static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
> u64 ref48;
>
> /* both rw and write zeroes share the same reftag format */
> - switch (ns->guard_type) {
> + switch (ns->head->guard_type) {
I think that the whole PI stuff needs to be handled with a bit more
consideration, because if not all paths agree on the PI (as we have
HBAs with fabrics) we can't just override or do a logical OR on
the capabilities/attributes.
> case NVME_NVM_NS_16B_GUARD:
> cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
> break;
> @@ -871,15 +871,16 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
> cmnd->write_zeroes.slba =
> cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
> cmnd->write_zeroes.length =
> - cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
> + cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
>
> - if (!(req->cmd_flags & REQ_NOUNMAP) && (ns->features & NVME_NS_DEAC))
> + if (!(req->cmd_flags & REQ_NOUNMAP) &&
> + (ns->head->features & NVME_NS_DEAC))
> cmnd->write_zeroes.control |= cpu_to_le16(NVME_WZ_DEAC);
>
> if (nvme_ns_has_pi(ns)) {
> cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT);
>
> - switch (ns->pi_type) {
> + switch (ns->head->pi_type) {
> case NVME_NS_DPS_PI_TYPE1:
> case NVME_NS_DPS_PI_TYPE2:
> nvme_set_ref_tag(ns, cmnd, req);
> @@ -912,12 +913,13 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
> cmnd->rw.cdw3 = 0;
> cmnd->rw.metadata = 0;
> cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
> - cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
> + cmnd->rw.length =
> + cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
> cmnd->rw.reftag = 0;
> cmnd->rw.apptag = 0;
> cmnd->rw.appmask = 0;
>
> - if (ns->ms) {
> + if (ns->head->ms) {
> /*
> * If formated with metadata, the block layer always provides a
> * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else
> @@ -930,7 +932,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
> control |= NVME_RW_PRINFO_PRACT;
> }
>
> - switch (ns->pi_type) {
> + switch (ns->head->pi_type) {
> case NVME_NS_DPS_PI_TYPE3:
> control |= NVME_RW_PRINFO_PRCHK_GUARD;
> break;
> @@ -1676,9 +1678,9 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
> {
> struct blk_integrity integrity = { };
>
> - switch (ns->pi_type) {
> + switch (ns->head->pi_type) {
> case NVME_NS_DPS_PI_TYPE3:
> - switch (ns->guard_type) {
> + switch (ns->head->guard_type) {
> case NVME_NVM_NS_16B_GUARD:
> integrity.profile = &t10_pi_type3_crc;
> integrity.tag_size = sizeof(u16) + sizeof(u32);
> @@ -1696,7 +1698,7 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
> break;
> case NVME_NS_DPS_PI_TYPE1:
> case NVME_NS_DPS_PI_TYPE2:
> - switch (ns->guard_type) {
> + switch (ns->head->guard_type) {
> case NVME_NVM_NS_16B_GUARD:
> integrity.profile = &t10_pi_type1_crc;
> integrity.tag_size = sizeof(u16);
> @@ -1717,7 +1719,7 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
> break;
> }
>
> - integrity.tuple_size = ns->ms;
> + integrity.tuple_size = ns->head->ms;
> blk_integrity_register(disk, &integrity);
> blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
> }
> @@ -1776,11 +1778,11 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
> int ret = 0;
> u32 elbaf;
>
> - ns->pi_size = 0;
> - ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
> + ns->head->pi_size = 0;
> + ns->head->ms = le16_to_cpu(id->lbaf[lbaf].ms);
> if (!(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
> - ns->pi_size = sizeof(struct t10_pi_tuple);
> - ns->guard_type = NVME_NVM_NS_16B_GUARD;
> + ns->head->pi_size = sizeof(struct t10_pi_tuple);
> + ns->head->guard_type = NVME_NVM_NS_16B_GUARD;
> goto set_pi;
> }
>
> @@ -1803,13 +1805,13 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
> if (nvme_elbaf_sts(elbaf))
> goto free_data;
>
> - ns->guard_type = nvme_elbaf_guard_type(elbaf);
> - switch (ns->guard_type) {
> + ns->head->guard_type = nvme_elbaf_guard_type(elbaf);
> + switch (ns->head->guard_type) {
> case NVME_NVM_NS_64B_GUARD:
> - ns->pi_size = sizeof(struct crc64_pi_tuple);
> + ns->head->pi_size = sizeof(struct crc64_pi_tuple);
> break;
> case NVME_NVM_NS_16B_GUARD:
> - ns->pi_size = sizeof(struct t10_pi_tuple);
> + ns->head->pi_size = sizeof(struct t10_pi_tuple);
> break;
> default:
> break;
> @@ -1818,10 +1820,10 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
> free_data:
> kfree(nvm);
> set_pi:
> - if (ns->pi_size && (first || ns->ms == ns->pi_size))
> - ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
> + if (ns->head->pi_size && (first || ns->head->ms == ns->head->pi_size))
> + ns->head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
> else
> - ns->pi_type = 0;
> + ns->head->pi_type = 0;
>
> return ret;
> }
> @@ -1835,8 +1837,8 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
> if (ret)
> return ret;
>
> - ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
> - if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
> + ns->head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
> + if (!ns->head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
> return 0;
>
> if (ctrl->ops->flags & NVME_F_FABRICS) {
> @@ -1848,7 +1850,7 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
> if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
> return 0;
>
> - ns->features |= NVME_NS_EXT_LBAS;
> + ns->head->features |= NVME_NS_EXT_LBAS;
>
> /*
> * The current fabrics transport drivers support namespace
> @@ -1860,7 +1862,7 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
> * gain the ability to use other metadata formats.
> */
> if (ctrl->max_integrity_segments && nvme_ns_has_pi(ns))
> - ns->features |= NVME_NS_METADATA_SUPPORTED;
> + ns->head->features |= NVME_NS_METADATA_SUPPORTED;
> } else {
> /*
> * For PCIe controllers, we can't easily remap the separate
> @@ -1869,9 +1871,9 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
> * We allow extended LBAs for the passthrough interface, though.
> */
> if (id->flbas & NVME_NS_FLBAS_META_EXT)
> - ns->features |= NVME_NS_EXT_LBAS;
> + ns->head->features |= NVME_NS_EXT_LBAS;
> else
> - ns->features |= NVME_NS_METADATA_SUPPORTED;
> + ns->head->features |= NVME_NS_METADATA_SUPPORTED;
> }
> return 0;
> }
> @@ -1898,7 +1900,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
> struct nvme_ns *ns, struct nvme_id_ns *id)
> {
> sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
> - u32 bs = 1U << ns->lba_shift;
> + u32 bs = 1U << ns->head->lba_shift;
> u32 atomic_bs, phys_bs, io_opt = 0;
>
> /*
> @@ -1906,7 +1908,8 @@ static void nvme_update_disk_info(struct gendisk *disk,
> * or smaller than a sector size yet, so catch this early and don't
> * allow block I/O.
> */
> - if (ns->lba_shift > PAGE_SHIFT || ns->lba_shift < SECTOR_SHIFT) {
> + if (ns->head->lba_shift > PAGE_SHIFT ||
> + ns->head->lba_shift < SECTOR_SHIFT) {
> capacity = 0;
> bs = (1 << 9);
> }
> @@ -1949,9 +1952,9 @@ static void nvme_update_disk_info(struct gendisk *disk,
> * I/O to namespaces with metadata except when the namespace supports
> * PI, as it can strip/insert in that case.
> */
> - if (ns->ms) {
> + if (ns->head->ms) {
> if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
> - (ns->features & NVME_NS_METADATA_SUPPORTED))
> + (ns->head->features & NVME_NS_METADATA_SUPPORTED))
> nvme_init_integrity(disk, ns,
> ns->ctrl->max_integrity_segments);
> else if (!nvme_ns_has_pi(ns))
> @@ -2052,7 +2055,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
>
> blk_mq_freeze_queue(ns->disk->queue);
> lbaf = nvme_lbaf_index(id->flbas);
> - ns->lba_shift = id->lbaf[lbaf].ds;
> + ns->head->lba_shift = id->lbaf[lbaf].ds;
> nvme_set_queue_limits(ns->ctrl, ns->queue);
>
> ret = nvme_configure_metadata(ns, id);
> @@ -2078,7 +2081,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
> * do not return zeroes.
> */
> if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
> - ns->features |= NVME_NS_DEAC;
> + ns->head->features |= NVME_NS_DEAC;
> set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
> set_bit(NVME_NS_READY, &ns->flags);
> blk_mq_unfreeze_queue(ns->disk->queue);
> diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
> index 529b9954d2b8..feee9cf50670 100644
> --- a/drivers/nvme/host/ioctl.c
> +++ b/drivers/nvme/host/ioctl.c
> @@ -283,10 +283,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
> return -EINVAL;
> }
>
> - length = (io.nblocks + 1) << ns->lba_shift;
> + length = (io.nblocks + 1) << ns->head->lba_shift;
>
> if ((io.control & NVME_RW_PRINFO_PRACT) &&
> - ns->ms == sizeof(struct t10_pi_tuple)) {
> + ns->head->ms == sizeof(struct t10_pi_tuple)) {
> /*
> * Protection information is stripped/inserted by the
> * controller.
> @@ -296,11 +296,11 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
> meta_len = 0;
> metadata = NULL;
> } else {
> - meta_len = (io.nblocks + 1) * ns->ms;
> + meta_len = (io.nblocks + 1) * ns->head->ms;
> metadata = nvme_to_user_ptr(io.metadata);
> }
>
> - if (ns->features & NVME_NS_EXT_LBAS) {
> + if (ns->head->features & NVME_NS_EXT_LBAS) {
> length += meta_len;
> meta_len = 0;
> } else if (meta_len) {
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 578e6d311bc9..1ebe6a9b42c9 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -451,6 +451,17 @@ struct nvme_ns_head {
> bool shared;
> int instance;
> struct nvme_effects_log *effects;
> + int lba_shift;
> + u16 ms;
> + u16 pi_size;
> + u16 sgs;
> + u32 sws;
> + u8 pi_type;
> + u8 guard_type;
> +#ifdef CONFIG_BLK_DEV_ZONED
> + u64 zsze;
> +#endif
> + unsigned long features;
>
> struct cdev cdev;
> struct device cdev_device;
> @@ -492,17 +503,6 @@ struct nvme_ns {
> struct kref kref;
> struct nvme_ns_head *head;
>
> - int lba_shift;
> - u16 ms;
> - u16 pi_size;
> - u16 sgs;
> - u32 sws;
> - u8 pi_type;
> - u8 guard_type;
> -#ifdef CONFIG_BLK_DEV_ZONED
> - u64 zsze;
> -#endif
> - unsigned long features;
> unsigned long flags;
> #define NVME_NS_REMOVING 0
> #define NVME_NS_ANA_PENDING 2
> @@ -519,7 +519,7 @@ struct nvme_ns {
> /* NVMe ns supports metadata actions by the controller (generate/strip) */
> static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
> {
> - return ns->pi_type && ns->ms == ns->pi_size;
> + return ns->head->pi_type && ns->head->ms == ns->head->pi_size;
> }
>
> struct nvme_ctrl_ops {
> @@ -653,7 +653,7 @@ static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl)
> */
> static inline u64 nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector)
> {
> - return sector >> (ns->lba_shift - SECTOR_SHIFT);
> + return sector >> (ns->head->lba_shift - SECTOR_SHIFT);
> }
>
> /*
> @@ -661,7 +661,7 @@ static inline u64 nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector)
> */
> static inline sector_t nvme_lba_to_sect(struct nvme_ns *ns, u64 lba)
> {
> - return lba << (ns->lba_shift - SECTOR_SHIFT);
> + return lba << (ns->head->lba_shift - SECTOR_SHIFT);
> }
>
> /*
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> index 81e2621169e5..fc0df91e6b36 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -1423,7 +1423,7 @@ static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue,
> goto mr_put;
>
> nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_bdev->bd_disk), c,
> - req->mr->sig_attrs, ns->pi_type);
> + req->mr->sig_attrs, ns->head->pi_type);
> nvme_rdma_set_prot_checks(c, &req->mr->sig_attrs->check_mask);
>
> ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
> diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
> index ec8557810c21..fa9e8f664ae7 100644
> --- a/drivers/nvme/host/zns.c
> +++ b/drivers/nvme/host/zns.c
> @@ -11,7 +11,7 @@ int nvme_revalidate_zones(struct nvme_ns *ns)
> {
> struct request_queue *q = ns->queue;
>
> - blk_queue_chunk_sectors(q, ns->zsze);
> + blk_queue_chunk_sectors(q, ns->head->zsze);
> blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append);
>
> return blk_revalidate_disk_zones(ns->disk, NULL);
> @@ -99,11 +99,12 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
> goto free_data;
> }
>
> - ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
> - if (!is_power_of_2(ns->zsze)) {
> + ns->head->zsze =
> + nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
> + if (!is_power_of_2(ns->head->zsze)) {
> dev_warn(ns->ctrl->device,
> "invalid zone size:%llu for namespace:%u\n",
> - ns->zsze, ns->head->ns_id);
> + ns->head->zsze, ns->head->ns_id);
> status = -ENODEV;
> goto free_data;
> }
> @@ -128,7 +129,7 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
> sizeof(struct nvme_zone_descriptor);
>
> nr_zones = min_t(unsigned int, nr_zones,
> - get_capacity(ns->disk) >> ilog2(ns->zsze));
> + get_capacity(ns->disk) >> ilog2(ns->head->zsze));
>
> bufsize = sizeof(struct nvme_zone_report) +
> nr_zones * sizeof(struct nvme_zone_descriptor);
> @@ -162,7 +163,7 @@ static int nvme_zone_parse_entry(struct nvme_ns *ns,
>
> zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
> zone.cond = entry->zs >> 4;
> - zone.len = ns->zsze;
> + zone.len = ns->head->zsze;
> zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
> zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
> if (zone.cond == BLK_ZONE_COND_FULL)
> @@ -196,7 +197,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
> c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
> c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
>
> - sector &= ~(ns->zsze - 1);
> + sector &= ~(ns->head->zsze - 1);
> while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
> memset(report, 0, buflen);
>
> @@ -220,7 +221,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
> zone_idx++;
> }
>
> - sector += ns->zsze * nz;
> + sector += ns->head->zsze * nz;
> }
>
> if (zone_idx > 0)
Same comment on PI.
Other than that,
Reviewed-by: Sagi Grimberg <[email protected]>
Same comment on PI here.
Other than that,
Reviewed-by: Sagi Grimberg <[email protected]>
Reviewed-by: Sagi Grimberg <[email protected]>
Reviewed-by: Sagi Grimberg <[email protected]>
Reviewed-by: Sagi Grimberg <[email protected]>
On Wed, Dec 13, 2023 at 03:27:12PM +0200, Sagi Grimberg wrote:
> > @@ -841,7 +841,7 @@ static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
> > u64 ref48;
> > /* both rw and write zeroes share the same reftag format */
> > - switch (ns->guard_type) {
> > + switch (ns->head->guard_type) {
>
> I think that the whole PI stuff needs to be taken with a bit more
> consideration because if not all paths agree on the pi (as we have
> hbas with fabrics) we can't just override or do a logical or on
> the capabilities/attributes.
So should the PI variables stay in nvme_ns at this point? Or should I
add some checks which avoid an override and warn in this case?
On Wed, Dec 13, 2023 at 03:54:25PM +0100, Daniel Wagner wrote:
> > I think that the whole PI stuff needs to be taken with a bit more
> > consideration because if not all paths agree on the pi (as we have
> > hbas with fabrics) we can't just override or do a logical or on
> > the capabilities/attributes.
>
> So should the PI variables stay in nvme_ns at this point? Or should I
> add some checks which avoid an override and warn in this case?
Didn't we merge the patch from Max to require uniform PI setups
for all controllers that we're using in a multipath setup? I'll
check the code after finishing a few more things if no one remembers
offhand.
On Wed, Dec 13, 2023 at 04:38:34PM +0100, Christoph Hellwig wrote:
> On Wed, Dec 13, 2023 at 03:54:25PM +0100, Daniel Wagner wrote:
> > > I think that the whole PI stuff needs to be taken with a bit more
> > > consideration because if not all paths agree on the pi (as we have
> > > hbas with fabrics) we can't just override or do a logical or on
> > > the capabilities/attributes.
> >
> > So should the PI variables stay in nvme_ns at this point? Or should I
> > add some checks which avoid an override and warn in this case?
>
> Didn't we merge the patch from max to require uniform PI setups
> for all controllers that we're using in a multipath setup? I'll
> check the code after finishing a few more things if no one remembers
> offhand.
The newest discussion on this topic I found was this:
https://lore.kernel.org/linux-nvme/[email protected]/
On 13/12/2023 17:38, Christoph Hellwig wrote:
> On Wed, Dec 13, 2023 at 03:54:25PM +0100, Daniel Wagner wrote:
>>> I think that the whole PI stuff needs to be taken with a bit more
>>> consideration because if not all paths agree on the pi (as we have
>>> hbas with fabrics) we can't just override or do a logical or on
>>> the capabilities/attributes.
>>
>> So should the PI variables stay in nvme_ns at this point? Or should I
>> add some checks which avoid an override and warn in this case?
>
> Didn't we merge the patch from max to require uniform PI setups
> for all controllers that we're using in a multipath setup? I'll
> check the code after finishing a few more things if no one remembers
> offhand.
>
Not yet.
I will work on it and send it soon, along with another small bug fix I found.
Then we'll need to rebase this series on top.