Fix a possible NULL pointer dereference when trying to read
hwmon sysfs entries associated to NVMe-oF devices that were
hot-removed or disconnected.
Unregister the NVMe hwmon device upon controller teardown
(nvme_stop_ctrl()).
Signed-off-by: Enzo Matsumiya <[email protected]>
---
drivers/nvme/host/core.c | 1 +
drivers/nvme/host/hwmon.c | 8 ++++++++
drivers/nvme/host/nvme.h | 2 ++
3 files changed, 11 insertions(+)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 9a270e49df17..becc80a0c3b8 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4344,6 +4344,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event);
void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
{
+ nvme_hwmon_exit(ctrl);
nvme_mpath_stop(ctrl);
nvme_stop_keep_alive(ctrl);
flush_work(&ctrl->async_event_work);
diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c
index 552dbc04567b..7f62cca4c577 100644
--- a/drivers/nvme/host/hwmon.c
+++ b/drivers/nvme/host/hwmon.c
@@ -71,6 +71,9 @@ static int nvme_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
int temp;
int err;
+ if (data->ctrl->state != NVME_CTRL_LIVE)
+ return -EAGAIN;
+
/*
* First handle attributes which don't require us to read
* the smart log.
@@ -253,3 +256,8 @@ int nvme_hwmon_init(struct nvme_ctrl *ctrl)
return 0;
}
+
+void nvme_hwmon_exit(struct nvme_ctrl *ctrl)
+{
+ hwmon_device_unregister(ctrl->dev);
+}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 567f7ad18a91..621e9b1575f6 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -807,11 +807,13 @@ static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
#ifdef CONFIG_NVME_HWMON
int nvme_hwmon_init(struct nvme_ctrl *ctrl);
+void nvme_hwmon_exit(struct nvme_ctrl *ctrl);
#else
static inline int nvme_hwmon_init(struct nvme_ctrl *ctrl)
{
return 0;
}
+static inline void nvme_hwmon_exit(struct nvme_ctrl *ctrl) { }
#endif
u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
--
2.29.2
On Wed, Dec 09, 2020 at 06:32:27PM -0300, Enzo Matsumiya wrote:
> +void nvme_hwmon_exit(struct nvme_ctrl *ctrl)
> +{
> + hwmon_device_unregister(ctrl->dev);
> +}
The hwmon registration uses the devm_ version, so don't we need to use
the devm_hwmon_device_unregister() here?
Enzo,
On 12/9/20 13:39, Enzo Matsumiya wrote:
> Fix a possible NULL pointer dereference when trying to read
> hwmon sysfs entries associated to NVMe-oF devices that were
> hot-removed or disconnected.
>
> Unregister the NVMe hwmon device upon controller teardown
> (nvme_stop_ctrl()).
>
> Signed-off-by: Enzo Matsumiya <[email protected]>
(Without looking into the code)
Do you have blktests testcases for NVMeOFhot-removed and disconnected
device scenario to produce the problem ?
On 12/9/20 10:32 PM, Enzo Matsumiya wrote:
> Fix a possible NULL pointer dereference when trying to read
> hwmon sysfs entries associated to NVMe-oF devices that were
> hot-removed or disconnected.
>
> Unregister the NVMe hwmon device upon controller teardown
> (nvme_stop_ctrl()).
>
> Signed-off-by: Enzo Matsumiya <[email protected]>
> ---
> drivers/nvme/host/core.c | 1 +
> drivers/nvme/host/hwmon.c | 8 ++++++++
> drivers/nvme/host/nvme.h | 2 ++
> 3 files changed, 11 insertions(+)
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 9a270e49df17..becc80a0c3b8 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -4344,6 +4344,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event);
>
> void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
> {
> + nvme_hwmon_exit(ctrl);
> nvme_mpath_stop(ctrl);
> nvme_stop_keep_alive(ctrl);
> flush_work(&ctrl->async_event_work);
> diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c
> index 552dbc04567b..7f62cca4c577 100644
> --- a/drivers/nvme/host/hwmon.c
> +++ b/drivers/nvme/host/hwmon.c
> @@ -71,6 +71,9 @@ static int nvme_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
> int temp;
> int err;
>
> + if (data->ctrl->state != NVME_CTRL_LIVE)
> + return -EAGAIN;
> +
> /*
> * First handle attributes which don't require us to read
> * the smart log.
> @@ -253,3 +256,8 @@ int nvme_hwmon_init(struct nvme_ctrl *ctrl)
>
> return 0;
> }
> +
> +void nvme_hwmon_exit(struct nvme_ctrl *ctrl)
> +{
> + hwmon_device_unregister(ctrl->dev);
> +}
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 567f7ad18a91..621e9b1575f6 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -807,11 +807,13 @@ static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
>
> #ifdef CONFIG_NVME_HWMON
> int nvme_hwmon_init(struct nvme_ctrl *ctrl);
> +void nvme_hwmon_exit(struct nvme_ctrl *ctrl);
> #else
> static inline int nvme_hwmon_init(struct nvme_ctrl *ctrl)
> {
> return 0;
> }
> +static inline void nvme_hwmon_exit(struct nvme_ctrl *ctrl) { }
> #endif
>
> u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
>
Something's fishy here.
The hwmon attributes should have been created only once for the lifetime
of the controller (creation is protected by '!initialized').
And the hwmon lifetime should've been controlled by the lifetime of the
controller (which to my knowledge was the idea behind the devm_* thingies).
So why do we have to deallocate the hwmon attributes?
And why on reset? And who's re-creating them after reset, seeing that
'initialized' should be true?
Hmm?
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
[email protected] +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer
Hi Chaitanya,
On Thu, Dec 10, 2020 at 04:35:14AM +0000, Chaitanya Kulkarni wrote:
> Enzo,
> On 12/9/20 13:39, Enzo Matsumiya wrote:
> > Fix a possible NULL pointer dereference when trying to read
> > hwmon sysfs entries associated to NVMe-oF devices that were
> > hot-removed or disconnected.
> >
> > Unregister the NVMe hwmon device upon controller teardown
> > (nvme_stop_ctrl()).
> >
> > Signed-off-by: Enzo Matsumiya <[email protected]>
> (Without looking into the code)
>
> Do you have blktests testcases for NVMeOFhot-removed and disconnected
> device scenario to produce the problem ?
Here is a rough blktests' test to reproduce the problem. There is
clearly a race as the crash not always happens at the same iteration. On
my system I was able to hit consistently within 50 iterations. Maybe you
need to tweak it on your system to reproduce it.
Thanks,
Daniel
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0+
# Copyright (c) 2020 SUSE LLC
#
# Test NVMeOF read hwmon device info
. tests/nvme/rc
DESCRIPTION="read hwmon device info"
requires() {
_nvme_requires
_have_modules loop
_require_nvme_trtype_is_fabrics
}
_get_nvme_hwmon_devs() {
echo "/sys/devices/virtual/nvme-fabrics/ctl/hwmon*"
}
_read_nvme_hwmon_attr() {
hwmon_devs="$(_get_nvme_hwmon_devs)"
for hwmon in "${hwmon_devs}"; do
cat ${hwmon}"/name"
cat ${hwmon}"/temp1_label"
cat ${hwmon}"/temp1_alarm"
cat ${hwmon}"/temp1_input"
done
}
test() {
echo "Running ${TEST_NAME}"
_setup_nvmet
local port
local loop_dev
local file_path="$TMPDIR/img"
local subsys_name="blktests-subsystem-1"
truncate -s 1G "${file_path}"
loop_dev="$(losetup -f --show "${file_path}")"
_create_nvmet_subsystem "${subsys_name}" "${loop_dev}" \
"91fdba0d-f87b-4c25-b80f-db7be1418b9e"
port="$(_create_nvmet_port "${nvme_trtype}")"
_add_nvmet_subsys_to_port "${port}" "${subsys_name}"
for i in $(seq 1 50); do
_nvme_connect_subsys "${nvme_trtype}" "${subsys_name}"
_read_nvme_hwmon_attr
_nvme_disconnect_subsys "${subsys_name}"
_read_nvme_hwmon_attr
done
_remove_nvmet_subsystem_from_port "${port}" "${subsys_name}"
_remove_nvmet_subsystem "${subsys_name}"
_remove_nvmet_port "${port}"
losetup -d "${loop_dev}"
rm "${file_path}"
echo "Test complete"
}
On Fri, Dec 11, 2020 at 03:12:53PM +0100, Hannes Reinecke wrote:
> So why do we have to deallocate the hwmon attributes?
> And why on reset? And who's re-creating them after reset, seeing that
> 'initialized' should be true?
nvmet: adding nsid 1 to subsystem blktests-subsystem-1
nvmet: creating controller 1 for subsystem blktests-subsystem-1 for NQN nqn.2014-08.org.nvmexpress:uu.
nvme-fabrics ctl: DEVRES ADD 000000009bc92dfd devm_kzalloc_release (552 bytes)
nvme-fabrics ctl: DEVRES ADD 0000000099e1e156 devm_hwmon_release (8 bytes)
I've enabled CONFIG_DEVRES_DEBUG and see only 'DEVRES ADD' message. If I
read it correctly the problem is that the resource is attached to the ctl
devm object and not for the nvme devm object.
On Wed, Dec 30, 2020 at 03:38:05PM +0100, Daniel Wagner wrote:
> I've enabled CONFIG_DEVRES_DEBUG and see only 'DEVRES ADD' message. If I
> read it correctly the problem is that the resource is attached to the ctl
> devm object and not for the nvme devm object.
I've attached the hwmon to the nvme object, which fixes the obvious
problem. But I still don't see any 'DEVRES REM' message. This would
indicate the nvme object never really goes away... more debugging...
On Wed, Dec 30, 2020 at 04:16:53PM +0100, Daniel Wagner wrote:
> I've attached the hwmon to the nvme object, which fixes the obvious
> problem. But I still don't see any 'DEVRES REM' message. This would
> indicate the nvme object never really goes away... more debugging...
Stupid me, the nvme device is not managed by devm.
Thank you all for the feedback. I'll re-evaluate my patch and resubmit
it soon.
The problem clearly exists as I can easily reproduce it on my test
setup, but the strategy to fix it needs some rework apparently.
@Daniel maybe try tweaking your tests to use a smaller controller
loss timeout (-l option)? I do this on my tests because the default
value kicks in about 30min after hot-removal -- i.e. you
have to actually wait for the timeout to expire to trigger the bug.
Cheers,
Enzo
On Mon, Jan 04, 2021 at 06:06:10PM -0300, Enzo Matsumiya wrote:
> @Daniel maybe try tweaking your tests to use a smaller controller
> loss timeout (-l option)? I do this on my tests because the default
> value kicks in about 30min after hot-removal -- i.e. you
> have to actually wait for the timeout to expire to trigger the bug.
As far I can tell, the blktests test I am using will trigger the same
bug. The problem is that the lifetime of hwmon sysfs entry should be
aligned to the lifetime of the nvme sysfs entry. Currently, hwmon's
lifetime is bound to the lifetime of the ctl sysfs entry. When the nvme
entry goes away (and obviously also the underlying device), the hwmon
sysfs entry still references it.
On 1/5/21 10:45 AM, Daniel Wagner wrote:
> On Mon, Jan 04, 2021 at 06:06:10PM -0300, Enzo Matsumiya wrote:
>> @Daniel maybe try tweaking your tests to use a smaller controller
>> loss timeout (-l option)? I do this on my tests because the default
>> value kicks in about 30min after hot-removal -- i.e. you
>> have to actually wait for the timeout to expire to trigger the bug.
>
> As far I can tell, the blktests test I am using will trigger the same
> bug. The problem is that the lifetime of hwmon sysfs entry should be
> aligned to the lifetime of the nvme sysfs entry. Currently, hwmon's
> lifetime is bound to the lifetime of the ctl sysfs entry. When the nvme
> entry goes away (and obviously also the underlying device), the hwmon
> sysfs entry still references it.
>
Yeah, using the controller node for devm allocations is quite dodgy.
Does this one help?
diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c
index 6fdd07fb3001..7260af028cf7 100644
--- a/drivers/nvme/host/hwmon.c
+++ b/drivers/nvme/host/hwmon.c
@@ -226,7 +226,7 @@ static const struct hwmon_chip_info
int nvme_hwmon_init(struct nvme_ctrl *ctrl)
{
- struct device *dev = ctrl->dev;
+ struct device *dev = ctrl->device;
struct nvme_hwmon_data *data;
struct device *hwmon;
int err;
@@ -240,8 +240,7 @@ int nvme_hwmon_init(struct nvme_ctrl *ctrl)
err = nvme_hwmon_get_smart_log(data);
if (err) {
- dev_warn(ctrl->device,
- "Failed to read smart log (error %d)\n", err);
+ dev_warn(dev, "Failed to read smart log (error %d)\n", err);
devm_kfree(dev, data);
return err;
}
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
[email protected] +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer
We could just stop using devm and do explicit resource management.
Btw, for the next patch please cc the author of the nvme-hwmon code.
On Mon, Jan 11, 2021 at 05:00:18PM +0100, Hannes Reinecke wrote:
> Yeah, using the controller node for devm allocations is quite dodgy.
> Does this one help?
>
> diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c
> index 6fdd07fb3001..7260af028cf7 100644
> --- a/drivers/nvme/host/hwmon.c
> +++ b/drivers/nvme/host/hwmon.c
> @@ -226,7 +226,7 @@ static const struct hwmon_chip_info
>
> int nvme_hwmon_init(struct nvme_ctrl *ctrl)
> {
> - struct device *dev = ctrl->dev;
> + struct device *dev = ctrl->device;
This is what I tried. Yes, it fixes the obvious problem and moves the
hwmon entry under the nvme entry. When the nvme is destroyed, the hwmon
entry is not accessible. But as ctrl->device is not managed by devm the
resources are not freed.