2020-07-08 13:37:24

by Lukas Wunner

[permalink] [raw]
Subject: [PATCH 2/3] driver core: Use rwsem for kill_device() serialization

kill_device() is currently serialized with driver probing by way of the
device_lock(). We're about to serialize it with device_add() as well
to prevent addition of children below a device which is going away.
However the parent's device_lock() cannot be taken by device_add()
lest deadlocks occur: Addition of the parent may result in addition
of children (as is the case with SPI controllers) and device_add()
already takes the device_lock through the call to bus_probe_device() ->
device_initial_probe() -> __device_attach().

Avoid such deadlocks by introducing an rw_semaphore whose specific
purpose is to serialize kill_device() with other parts of the kernel.

Use an rw_semaphore instead of a mutex because the latter would preclude
concurrent driver probing of multiple children below the same parent.
The semaphore is acquired for writing when declaring a device dead and
otherwise only acquired for reading. It is private to the driver core,
obviating the need to acquire a lock when calling kill_device(), as is
currently done in nvdimm's nd_device_unregister() and device_del().

An alternative approach would be to convert the device_lock() itself to
an rw_semaphore (instead of a mutex).

Signed-off-by: Lukas Wunner <[email protected]>
Cc: Dan Williams <[email protected]>
Cc: Geert Uytterhoeven <[email protected]>
Cc: Pantelis Antoniou <[email protected]>
Cc: Alexander Duyck <[email protected]>
---
drivers/base/base.h | 2 ++
drivers/base/core.c | 33 +++++++++++++++++++--------------
drivers/base/dd.c | 8 ++++++++
drivers/nvdimm/bus.c | 8 +-------
4 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/drivers/base/base.h b/drivers/base/base.h
index 95c22c0f90360..7e71a1d262ef6 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -79,6 +79,7 @@ struct driver_private {
* @async_driver - pointer to device driver awaiting probe via async_probe
* @device - pointer back to the struct device that this structure is
* associated with.
+ * @dead_sem - semaphore taken when declaring the device @dead.
* @dead - This device is currently either in the process of or has been
* removed from the system. Any asynchronous events scheduled for this
* device should exit without taking any action.
@@ -94,6 +95,7 @@ struct device_private {
struct list_head deferred_probe;
struct device_driver *async_driver;
struct device *device;
+ struct rw_semaphore dead_sem;
u8 dead:1;
};
#define to_device_private_parent(obj) \
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 67d39a90b45c7..057da42b1a660 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -2526,6 +2526,7 @@ static int device_private_init(struct device *dev)
klist_init(&dev->p->klist_children, klist_children_get,
klist_children_put);
INIT_LIST_HEAD(&dev->p->deferred_probe);
+ init_rwsem(&dev->p->dead_sem);
return 0;
}

@@ -2780,21 +2781,27 @@ void put_device(struct device *dev)
}
EXPORT_SYMBOL_GPL(put_device);

+/**
+ * kill_device - declare device dead
+ * @dev: device in question
+ *
+ * Declare @dev dead to prevent it from binding to a driver.
+ * Return true if it was killed or false if it was already dead.
+ */
bool kill_device(struct device *dev)
{
- /*
- * Require the device lock and set the "dead" flag to guarantee that
- * the update behavior is consistent with the other bitfields near
- * it and that we cannot have an asynchronous probe routine trying
- * to run while we are tearing out the bus/class/sysfs from
- * underneath the device.
- */
- lockdep_assert_held(&dev->mutex);
+ bool killed;

- if (dev->p->dead)
- return false;
- dev->p->dead = true;
- return true;
+ down_write(&dev->p->dead_sem);
+ if (dev->p->dead) {
+ killed = false;
+ } else {
+ dev->p->dead = true;
+ killed = true;
+ }
+ up_write(&dev->p->dead_sem);
+
+ return killed;
}
EXPORT_SYMBOL_GPL(kill_device);

@@ -2817,9 +2824,7 @@ void device_del(struct device *dev)
struct kobject *glue_dir = NULL;
struct class_interface *class_intf;

- device_lock(dev);
kill_device(dev);
- device_unlock(dev);

if (dev->fwnode && dev->fwnode->dev == dev)
dev->fwnode->dev = NULL;
diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 31c668651e824..9353d811cce83 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -817,6 +817,7 @@ static void __device_attach_async_helper(void *_dev, async_cookie_t cookie)
};

device_lock(dev);
+ down_read(&dev->p->dead_sem);

/*
* Check if device has already been removed or claimed. This may
@@ -838,6 +839,7 @@ static void __device_attach_async_helper(void *_dev, async_cookie_t cookie)
if (dev->parent)
pm_runtime_put(dev->parent);
out_unlock:
+ up_read(&dev->p->dead_sem);
device_unlock(dev);

put_device(dev);
@@ -848,6 +850,7 @@ static int __device_attach(struct device *dev, bool allow_async)
int ret = 0;

device_lock(dev);
+ down_read(&dev->p->dead_sem);
if (dev->p->dead) {
goto out_unlock;
} else if (dev->driver) {
@@ -893,6 +896,7 @@ static int __device_attach(struct device *dev, bool allow_async)
pm_runtime_put(dev->parent);
}
out_unlock:
+ up_read(&dev->p->dead_sem);
device_unlock(dev);
return ret;
}
@@ -967,6 +971,7 @@ int device_driver_attach(struct device_driver *drv, struct device *dev)
int ret = 0;

__device_driver_lock(dev, dev->parent);
+ down_read(&dev->p->dead_sem);

/*
* If device has been removed or someone has already successfully
@@ -975,6 +980,7 @@ int device_driver_attach(struct device_driver *drv, struct device *dev)
if (!dev->p->dead && !dev->driver)
ret = driver_probe_device(drv, dev);

+ up_read(&dev->p->dead_sem);
__device_driver_unlock(dev, dev->parent);

return ret;
@@ -987,6 +993,7 @@ static void __driver_attach_async_helper(void *_dev, async_cookie_t cookie)
int ret = 0;

__device_driver_lock(dev, dev->parent);
+ down_read(&dev->p->dead_sem);

drv = dev->p->async_driver;

@@ -997,6 +1004,7 @@ static void __driver_attach_async_helper(void *_dev, async_cookie_t cookie)
if (!dev->p->dead && !dev->driver)
ret = driver_probe_device(drv, dev);

+ up_read(&dev->p->dead_sem);
__device_driver_unlock(dev, dev->parent);

dev_dbg(dev, "driver %s async attach completed: %d\n", drv->name, ret);
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 09087c38fabdc..35e069c69386a 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -559,8 +559,6 @@ EXPORT_SYMBOL(nd_device_register);

void nd_device_unregister(struct device *dev, enum nd_async_mode mode)
{
- bool killed;
-
switch (mode) {
case ND_ASYNC:
/*
@@ -584,11 +582,7 @@ void nd_device_unregister(struct device *dev, enum nd_async_mode mode)
* or otherwise let the async path handle it if the
* unregistration was already queued.
*/
- nd_device_lock(dev);
- killed = kill_device(dev);
- nd_device_unlock(dev);
-
- if (!killed)
+ if (!kill_device(dev))
return;

nd_synchronize();
--
2.27.0


2020-07-30 06:55:09

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [PATCH 2/3] driver core: Use rwsem for kill_device() serialization

On Wed, Jul 08, 2020 at 03:27:02PM +0200, Lukas Wunner wrote:
> kill_device() is currently serialized with driver probing by way of the
> device_lock(). We're about to serialize it with device_add() as well
> to prevent addition of children below a device which is going away.

Why? Who does this? Shouldn't the bus that is trying to do this know
this is happening?

> However the parent's device_lock() cannot be taken by device_add()
> lest deadlocks occur: Addition of the parent may result in addition
> of children (as is the case with SPI controllers) and device_add()
> already takes the device_lock through the call to bus_probe_device() ->
> device_initial_probe() -> __device_attach().
>
> Avoid such deadlocks by introducing an rw_semaphore whose specific
> purpose is to serialize kill_device() with other parts of the kernel.

Ugh, another lock, really? :(

> Use an rw_semaphore instead of a mutex because the latter would preclude
> concurrent driver probing of multiple children below the same parent.
> The semaphore is acquired for writing when declaring a device dead and
> otherwise only acquired for reading. It is private to the driver core,
> obviating the need to acquire a lock when calling kill_device(), as is
> currently done in nvdimm's nd_device_unregister() and device_del().
>
> An alternative approach would be to convert the device_lock() itself to
> an rw_semaphore (instead of a mutex).
>
> Signed-off-by: Lukas Wunner <[email protected]>
> Cc: Dan Williams <[email protected]>
> Cc: Geert Uytterhoeven <[email protected]>
> Cc: Pantelis Antoniou <[email protected]>
> Cc: Alexander Duyck <[email protected]>
> ---
> drivers/base/base.h | 2 ++
> drivers/base/core.c | 33 +++++++++++++++++++--------------
> drivers/base/dd.c | 8 ++++++++
> drivers/nvdimm/bus.c | 8 +-------
> 4 files changed, 30 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/base/base.h b/drivers/base/base.h
> index 95c22c0f90360..7e71a1d262ef6 100644
> --- a/drivers/base/base.h
> +++ b/drivers/base/base.h
> @@ -79,6 +79,7 @@ struct driver_private {
> * @async_driver - pointer to device driver awaiting probe via async_probe
> * @device - pointer back to the struct device that this structure is
> * associated with.
> + * @dead_sem - semaphore taken when declaring the device @dead.
> * @dead - This device is currently either in the process of or has been
> * removed from the system. Any asynchronous events scheduled for this
> * device should exit without taking any action.
> @@ -94,6 +95,7 @@ struct device_private {
> struct list_head deferred_probe;
> struct device_driver *async_driver;
> struct device *device;
> + struct rw_semaphore dead_sem;
> u8 dead:1;
> };
> #define to_device_private_parent(obj) \
> diff --git a/drivers/base/core.c b/drivers/base/core.c
> index 67d39a90b45c7..057da42b1a660 100644
> --- a/drivers/base/core.c
> +++ b/drivers/base/core.c
> @@ -2526,6 +2526,7 @@ static int device_private_init(struct device *dev)
> klist_init(&dev->p->klist_children, klist_children_get,
> klist_children_put);
> INIT_LIST_HEAD(&dev->p->deferred_probe);
> + init_rwsem(&dev->p->dead_sem);
> return 0;
> }
>
> @@ -2780,21 +2781,27 @@ void put_device(struct device *dev)
> }
> EXPORT_SYMBOL_GPL(put_device);
>
> +/**
> + * kill_device - declare device dead
> + * @dev: device in question
> + *
> + * Declare @dev dead to prevent it from binding to a driver.
> + * Return true if it was killed or false if it was already dead.
> + */
> bool kill_device(struct device *dev)
> {
> - /*
> - * Require the device lock and set the "dead" flag to guarantee that
> - * the update behavior is consistent with the other bitfields near
> - * it and that we cannot have an asynchronous probe routine trying
> - * to run while we are tearing out the bus/class/sysfs from
> - * underneath the device.
> - */
> - lockdep_assert_held(&dev->mutex);
> + bool killed;
>
> - if (dev->p->dead)
> - return false;
> - dev->p->dead = true;
> - return true;
> + down_write(&dev->p->dead_sem);
> + if (dev->p->dead) {
> + killed = false;
> + } else {
> + dev->p->dead = true;
> + killed = true;
> + }
> + up_write(&dev->p->dead_sem);
> +
> + return killed;
> }
> EXPORT_SYMBOL_GPL(kill_device);

meta-comment, this should have been device_kill() :(

>
> @@ -2817,9 +2824,7 @@ void device_del(struct device *dev)
> struct kobject *glue_dir = NULL;
> struct class_interface *class_intf;
>
> - device_lock(dev);
> kill_device(dev);
> - device_unlock(dev);
>
> if (dev->fwnode && dev->fwnode->dev == dev)
> dev->fwnode->dev = NULL;
> diff --git a/drivers/base/dd.c b/drivers/base/dd.c
> index 31c668651e824..9353d811cce83 100644
> --- a/drivers/base/dd.c
> +++ b/drivers/base/dd.c
> @@ -817,6 +817,7 @@ static void __device_attach_async_helper(void *_dev, async_cookie_t cookie)
> };
>
> device_lock(dev);
> + down_read(&dev->p->dead_sem);
>
> /*
> * Check if device has already been removed or claimed. This may
> @@ -838,6 +839,7 @@ static void __device_attach_async_helper(void *_dev, async_cookie_t cookie)
> if (dev->parent)
> pm_runtime_put(dev->parent);
> out_unlock:
> + up_read(&dev->p->dead_sem);
> device_unlock(dev);
>
> put_device(dev);
> @@ -848,6 +850,7 @@ static int __device_attach(struct device *dev, bool allow_async)
> int ret = 0;
>
> device_lock(dev);
> + down_read(&dev->p->dead_sem);

Ick, see, now two locks for the same device structure? I really don't
like that as the complexity involved is now double.


> if (dev->p->dead) {
> goto out_unlock;
> } else if (dev->driver) {
> @@ -893,6 +896,7 @@ static int __device_attach(struct device *dev, bool allow_async)
> pm_runtime_put(dev->parent);
> }
> out_unlock:
> + up_read(&dev->p->dead_sem);
> device_unlock(dev);
> return ret;
> }
> @@ -967,6 +971,7 @@ int device_driver_attach(struct device_driver *drv, struct device *dev)
> int ret = 0;
>
> __device_driver_lock(dev, dev->parent);
> + down_read(&dev->p->dead_sem);
>
> /*
> * If device has been removed or someone has already successfully
> @@ -975,6 +980,7 @@ int device_driver_attach(struct device_driver *drv, struct device *dev)
> if (!dev->p->dead && !dev->driver)
> ret = driver_probe_device(drv, dev);
>
> + up_read(&dev->p->dead_sem);
> __device_driver_unlock(dev, dev->parent);
>
> return ret;
> @@ -987,6 +993,7 @@ static void __driver_attach_async_helper(void *_dev, async_cookie_t cookie)
> int ret = 0;
>
> __device_driver_lock(dev, dev->parent);
> + down_read(&dev->p->dead_sem);
>
> drv = dev->p->async_driver;
>
> @@ -997,6 +1004,7 @@ static void __driver_attach_async_helper(void *_dev, async_cookie_t cookie)
> if (!dev->p->dead && !dev->driver)
> ret = driver_probe_device(drv, dev);
>
> + up_read(&dev->p->dead_sem);
> __device_driver_unlock(dev, dev->parent);
>
> dev_dbg(dev, "driver %s async attach completed: %d\n", drv->name, ret);
> diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
> index 09087c38fabdc..35e069c69386a 100644
> --- a/drivers/nvdimm/bus.c
> +++ b/drivers/nvdimm/bus.c
> @@ -559,8 +559,6 @@ EXPORT_SYMBOL(nd_device_register);
>
> void nd_device_unregister(struct device *dev, enum nd_async_mode mode)
> {
> - bool killed;
> -
> switch (mode) {
> case ND_ASYNC:
> /*
> @@ -584,11 +582,7 @@ void nd_device_unregister(struct device *dev, enum nd_async_mode mode)
> * or otherwise let the async path handle it if the
> * unregistration was already queued.
> */
> - nd_device_lock(dev);
> - killed = kill_device(dev);
> - nd_device_unlock(dev);
> -
> - if (!killed)
> + if (!kill_device(dev))

So, why are you pushing this down into the driver core, can't this be
done in whatever crazy bus wants to do this, like is done here?

That will save us the extra lock complexity in the driver core.

What bus wants/needs this now?

thanks,

greg k-h

2020-07-30 09:58:57

by Lukas Wunner

[permalink] [raw]
Subject: Re: [PATCH 2/3] driver core: Use rwsem for kill_device() serialization

On Thu, Jul 30, 2020 at 08:53:26AM +0200, Greg Kroah-Hartman wrote:
> On Wed, Jul 08, 2020 at 03:27:02PM +0200, Lukas Wunner wrote:
> > kill_device() is currently serialized with driver probing by way of the
> > device_lock(). We're about to serialize it with device_add() as well
> > to prevent addition of children below a device which is going away.
>
> Why? Who does this? Shouldn't the bus that is trying to do this know
> this is happening?

AFAICS, at least spi and i2c are affected.

I first thought that pci is affected as well but it seems the global
pci_lock_rescan_remove() performs the required serialization.

I've yet to take a closer look at acpi and usb. Any bus which
creates a device hierarchy with dynamic addition & removal needs
to make sure no new children are added after removal of the parent
has begun.


> So, why are you pushing this down into the driver core, can't this be
> done in whatever crazy bus wants to do this, like is done here?

I guess it can. Let me try to perform the locking at the bus level then.

Thanks,

Lukas

2020-07-31 06:48:11

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [PATCH 2/3] driver core: Use rwsem for kill_device() serialization

On Thu, Jul 30, 2020 at 11:56:10AM +0200, Lukas Wunner wrote:
> On Thu, Jul 30, 2020 at 08:53:26AM +0200, Greg Kroah-Hartman wrote:
> > On Wed, Jul 08, 2020 at 03:27:02PM +0200, Lukas Wunner wrote:
> > > kill_device() is currently serialized with driver probing by way of the
> > > device_lock(). We're about to serialize it with device_add() as well
> > > to prevent addition of children below a device which is going away.
> >
> > Why? Who does this? Shouldn't the bus that is trying to do this know
> > this is happening?
>
> AFAICS, at least spi and i2c are affected.
>
> I first thought that pci is affected as well but it seems the global
> pci_lock_rescan_remove() performs the required serialization.
>
> I've yet to take a closer look at acpi and usb. Any bus which
> creates a device hierarchy with dynamic addition & removal needs
> to make sure no new children are added after removal of the parent
> has begun.
>
>
> > So, why are you pushing this down into the driver core, can't this be
> > done in whatever crazy bus wants to do this, like is done here?
>
> I guess it can. Let me try to perform the locking at the bus level then.

I thought the bus code itself had this type of serialization already...

2020-07-31 09:55:42

by Lukas Wunner

[permalink] [raw]
Subject: Re: [PATCH 2/3] driver core: Use rwsem for kill_device() serialization

On Fri, Jul 31, 2020 at 08:45:05AM +0200, Greg Kroah-Hartman wrote:
> On Thu, Jul 30, 2020 at 11:56:10AM +0200, Lukas Wunner wrote:
> > On Thu, Jul 30, 2020 at 08:53:26AM +0200, Greg Kroah-Hartman wrote:
> > > On Wed, Jul 08, 2020 at 03:27:02PM +0200, Lukas Wunner wrote:
> > > > kill_device() is currently serialized with driver probing by way of the
> > > > device_lock(). We're about to serialize it with device_add() as well
> > > > to prevent addition of children below a device which is going away.
> > >
> > > Why? Who does this? Shouldn't the bus that is trying to do this know
> > > this is happening?
> >
> > AFAICS, at least spi and i2c are affected. Any bus which
> > creates a device hierarchy with dynamic addition & removal needs
> > to make sure no new children are added after removal of the parent
> > has begun.
>
> I thought the bus code itself had this type of serialization already...

An SPI device is inaccessible once the controller has been torn down,
yet drivers may need to reach SPI devices to unbind cleanly (e.g. to
quiesce interrupts on SPI devices).

Therefore SPI devices need to be unregistered first and the controller
last. However with CONFIG_OF_DYNAMIC=y, an SPI device may be added
at runtime via of_spi_notify(), which does take a ref on the controller,
but otherwise runs lockless against spi_unregister_controller().

What can happen here is an SPI device gets instantiated below a
controller as it is being removed and the SPI device can't be unbound
or removed cleanly because it's inaccessible.

The bus code can't do anything about this. It doesn't learn about the
controller going away until device_unregister() is called at the *end* of
spi_unregister_controller().

Anyway, the preliminary patch below should do the trick and I've also
cooked up something similar for i2c. Needs to be tested still.

Thanks,

Lukas

-- >8 --

diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 6626587..b6876dd 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -475,6 +475,12 @@ struct boardinfo {
*/
static DEFINE_MUTEX(board_lock);

+/*
+ * Prevents addition of devices with same chip select and
+ * addition of devices below an unregistering controller.
+ */
+static DEFINE_MUTEX(spi_add_lock);
+
/**
* spi_alloc_device - Allocate a new SPI device
* @ctlr: Controller to which device is connected
@@ -554,7 +560,6 @@ static int spi_dev_check(struct device *dev, void *data)
*/
int spi_add_device(struct spi_device *spi)
{
- static DEFINE_MUTEX(spi_add_lock);
struct spi_controller *ctlr = spi->controller;
struct device *dev = ctlr->dev.parent;
int status;
@@ -575,6 +580,12 @@ int spi_add_device(struct spi_device *spi)
*/
mutex_lock(&spi_add_lock);

+ if ((IS_ENABLED(CONFIG_OF_DYNAMIC) || IS_ENABLED(CONFIG_ACPI) ||
+ IS_ENABLED(CONFIG_SPI_SLAVE)) && ctlr->unregistering) {
+ status = -ENODEV;
+ goto done;
+ }
+
status = bus_for_each_dev(&spi_bus_type, NULL, spi, spi_dev_check);
if (status) {
dev_err(dev, "chipselect %d already in use\n",
@@ -2795,6 +2806,13 @@ void spi_unregister_controller(struct spi_controller *ctlr)
struct spi_controller *found;
int id = ctlr->bus_num;

+ /* Prevent addition of new devices, then remove existing ones */
+ if (IS_ENABLED(CONFIG_OF_DYNAMIC) || IS_ENABLED(CONFIG_ACPI) ||
+ IS_ENABLED(CONFIG_SPI_SLAVE)) {
+ mutex_lock(&spi_add_lock);
+ ctlr->unregistering = true;
+ mutex_unlock(&spi_add_lock);
+ }
device_for_each_child(&ctlr->dev, NULL, __unregister);

/* First make sure that this controller was ever added */
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 99380c0..6d95515 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -451,6 +451,7 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv)
* @irq_flags: Interrupt enable state during PTP system timestamping
* @fallback: fallback to pio if dma transfer return failure with
* SPI_TRANS_FAIL_NO_START.
+ * @unregistering: Set on controller removal, prevents addition of new devices.
*
* Each SPI controller can communicate with one or more @spi_device
* children. These make a small bus, sharing MOSI, MISO and SCK signals
@@ -667,6 +668,8 @@ struct spi_controller {

/* Interrupt enable state during PTP system timestamping */
unsigned long irq_flags;
+
+ unsigned int unregistering:1;
};

static inline void *spi_controller_get_devdata(struct spi_controller *ctlr)