2013-05-08 06:56:51

by Takashi Iwai

[permalink] [raw]
Subject: [PATCH 0/3] firmware: Fix usermodehelper deadlock at shutdown

Hi,

this is a series of patches for the issue we faced in the firmware
loader code during debugging the problem with dell_rbu driver with
3.9 kernel.

The original problem was that the shutdown gets stuck when DELL BIOS
update is performed. This turned out to be a problem in the firmware
loader. Although the reason of dell_rbu driver breakage is still
unclear, we should fix the firmware loader side, at least, not to
stall during shutdown.


thanks,

Takashi


2013-05-08 06:56:50

by Takashi Iwai

[permalink] [raw]
Subject: [PATCH 3/3] dell_rbu: Select CONFIG_FW_LOADER_USER_HELPER explicitly

The usermode helper is mandatory for this driver.

Signed-off-by: Takashi Iwai <[email protected]>
---
drivers/firmware/Kconfig | 1 +
1 file changed, 1 insertion(+)

diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index 9387630..0747872 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -64,6 +64,7 @@ config DELL_RBU
tristate "BIOS update support for DELL systems via sysfs"
depends on X86
select FW_LOADER
+ select FW_LOADER_USER_HELPER
help
Say m if you want to have the option of updating the BIOS for your
DELL system. Note you need a Dell OpenManage or Dell Update package (DUP)
--
1.8.2.1

2013-05-08 06:57:21

by Takashi Iwai

[permalink] [raw]
Subject: [PATCH 2/3] firmware: Avoid deadlock of usermodehelper lock at shutdown

When a system goes to reboot/shutdown, it tries to disable the
usermode helper via usermodehelper_disable(). This might be blocked
when a driver tries to load a firmware beforehand and it's stuck by
some reason.

In this patch, the firmware class driver registers a reboot notifier
so that it can abort all pending f/w bufs. Also enable a flag for
avoiding the call of usermodehelper after the reboot/shutdown starts.

Signed-off-by: Takashi Iwai <[email protected]>
---
drivers/base/firmware_class.c | 47 ++++++++++++++++++++++++++++++++++++-------
1 file changed, 40 insertions(+), 7 deletions(-)

diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c
index 9e1d39e..0cad0b4 100644
--- a/drivers/base/firmware_class.c
+++ b/drivers/base/firmware_class.c
@@ -27,6 +27,7 @@
#include <linux/pm.h>
#include <linux/suspend.h>
#include <linux/syscore_ops.h>
+#include <linux/reboot.h>

#include <generated/utsrelease.h>

@@ -171,6 +172,7 @@ static struct firmware_buf *__allocate_fw_buf(const char *fw_name,
strcpy(buf->fw_id, fw_name);
buf->fwc = fwc;
init_completion(&buf->completion);
+ INIT_LIST_HEAD(&buf->list);

pr_debug("%s: fw-%s buf=%p\n", __func__, fw_name, buf);

@@ -446,10 +448,9 @@ static struct firmware_priv *to_firmware_priv(struct device *dev)
return container_of(dev, struct firmware_priv, dev);
}

-static void fw_load_abort(struct firmware_priv *fw_priv)
+static void fw_load_abort(struct firmware_buf *buf)
{
- struct firmware_buf *buf = fw_priv->buf;
-
+ list_del_init(&buf->list);
set_bit(FW_STATUS_ABORT, &buf->status);
complete_all(&buf->completion);
}
@@ -457,6 +458,26 @@ static void fw_load_abort(struct firmware_priv *fw_priv)
#define is_fw_load_aborted(buf) \
test_bit(FW_STATUS_ABORT, &(buf)->status)

+static LIST_HEAD(pending_fw_head);
+static bool in_shutdown;
+
+/* reboot notifier for avoid deadlock with usermode_lock */
+static int fw_shutdown_notify(struct notifier_block *unused1,
+ unsigned long unused2, void *unused3)
+{
+ mutex_lock(&fw_lock);
+ in_shutdown = true;
+ while (!list_empty(&pending_fw_head))
+ fw_load_abort(list_first_entry(&pending_fw_head,
+ struct firmware_buf, list));
+ mutex_unlock(&fw_lock);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block fw_shutdown_nb = {
+ .notifier_call = fw_shutdown_notify,
+};
+
static ssize_t firmware_timeout_show(struct class *class,
struct class_attribute *attr,
char *buf)
@@ -604,6 +625,7 @@ static ssize_t firmware_loading_store(struct device *dev,
* is completed.
* */
fw_map_pages_buf(fw_buf);
+ list_del_init(&fw_buf->list);
complete_all(&fw_buf->completion);
break;
}
@@ -612,7 +634,7 @@ static ssize_t firmware_loading_store(struct device *dev,
dev_err(dev, "%s: unexpected value (%d)\n", __func__, loading);
/* fallthrough */
case -1:
- fw_load_abort(fw_priv);
+ fw_load_abort(fw_buf);
break;
}
out:
@@ -680,7 +702,7 @@ static int fw_realloc_buffer(struct firmware_priv *fw_priv, int min_size)
new_pages = kmalloc(new_array_size * sizeof(void *),
GFP_KERNEL);
if (!new_pages) {
- fw_load_abort(fw_priv);
+ fw_load_abort(buf);
return -ENOMEM;
}
memcpy(new_pages, buf->pages,
@@ -697,7 +719,7 @@ static int fw_realloc_buffer(struct firmware_priv *fw_priv, int min_size)
alloc_page(GFP_KERNEL | __GFP_HIGHMEM);

if (!buf->pages[buf->nr_pages]) {
- fw_load_abort(fw_priv);
+ fw_load_abort(buf);
return -ENOMEM;
}
buf->nr_pages++;
@@ -781,7 +803,7 @@ static void firmware_class_timeout_work(struct work_struct *work)
mutex_unlock(&fw_lock);
return;
}
- fw_load_abort(fw_priv);
+ fw_load_abort(fw_priv->buf);
mutex_unlock(&fw_lock);
}

@@ -857,6 +879,10 @@ static int _request_firmware_load(struct firmware_priv *fw_priv, bool uevent,
kobject_uevent(&fw_priv->dev.kobj, KOBJ_ADD);
}

+ mutex_lock(&fw_lock);
+ list_add(&buf->list, &pending_fw_head);
+ mutex_unlock(&fw_lock);
+
wait_for_completion(&buf->completion);

cancel_delayed_work_sync(&fw_priv->timeout_work);
@@ -897,6 +923,7 @@ fw_load_from_user_helper(struct firmware *firmware, const char *name,

/* No abort during direct loading */
#define is_fw_load_aborted(buf) false
+#define in_shutdown false

#endif /* CONFIG_FW_LOADER_USER_HELPER */

@@ -1019,6 +1046,10 @@ _request_firmware(const struct firmware **firmware_p, const char *name,
ret = 0;
if (!fw_get_filesystem_firmware(device, fw->priv)) {
long timeout = firmware_loading_timeout();
+ if (WARN_ON(in_shutdown)) {
+ ret = -EBUSY;
+ goto out;
+ }
if (nowait) {
timeout = usermodehelper_read_lock_wait(timeout);
if (!timeout) {
@@ -1521,6 +1552,7 @@ static int __init firmware_class_init(void)
{
fw_cache_init();
#ifdef CONFIG_FW_LOADER_USER_HELPER
+ register_reboot_notifier(&fw_shutdown_nb);
return class_register(&firmware_class);
#else
return 0;
@@ -1534,6 +1566,7 @@ static void __exit firmware_class_exit(void)
unregister_pm_notifier(&fw_cache.pm_notify);
#endif
#ifdef CONFIG_FW_LOADER_USER_HELPER
+ unregister_reboot_notifier(&fw_shutdown_nb);
class_unregister(&firmware_class);
#endif
}
--
1.8.2.1

2013-05-08 06:57:20

by Takashi Iwai

[permalink] [raw]
Subject: [PATCH 1/3] firmware: Avoid superfluous usermodehelper lock

When a firmware file can be loaded directly, there is no good reason
to lock usermodehelper. It's needed only when the direct fw load
fails and falls back to the user-mode helper.

Signed-off-by: Takashi Iwai <[email protected]>
---
drivers/base/firmware_class.c | 42 +++++++++++++++++++++++-------------------
1 file changed, 23 insertions(+), 19 deletions(-)

diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c
index 4b1f926..9e1d39e 100644
--- a/drivers/base/firmware_class.c
+++ b/drivers/base/firmware_class.c
@@ -1006,7 +1006,7 @@ _request_firmware(const struct firmware **firmware_p, const char *name,
struct device *device, bool uevent, bool nowait)
{
struct firmware *fw;
- long timeout;
+ bool usermode_locked = false;
int ret;

if (!firmware_p)
@@ -1017,31 +1017,35 @@ _request_firmware(const struct firmware **firmware_p, const char *name,
goto out;

ret = 0;
- timeout = firmware_loading_timeout();
- if (nowait) {
- timeout = usermodehelper_read_lock_wait(timeout);
- if (!timeout) {
- dev_dbg(device, "firmware: %s loading timed out\n",
- name);
- ret = -EBUSY;
- goto out;
- }
- } else {
- ret = usermodehelper_read_trylock();
- if (WARN_ON(ret)) {
- dev_err(device, "firmware: %s will not be loaded\n",
- name);
- goto out;
+ if (!fw_get_filesystem_firmware(device, fw->priv)) {
+ long timeout = firmware_loading_timeout();
+ if (nowait) {
+ timeout = usermodehelper_read_lock_wait(timeout);
+ if (!timeout) {
+ dev_dbg(device, "firmware: %s loading timed out\n",
+ name);
+ ret = -EBUSY;
+ goto out;
+ }
+ } else {
+ ret = usermodehelper_read_trylock();
+ if (WARN_ON(ret)) {
+ dev_err(device, "firmware: %s will not be loaded\n",
+ name);
+ goto out;
+ }
}
- }

- if (!fw_get_filesystem_firmware(device, fw->priv))
+ usermode_locked = true;
ret = fw_load_from_user_helper(fw, name, device,
uevent, nowait, timeout);
+ }
+
if (!ret)
ret = assign_firmware_buf(fw, device);

- usermodehelper_read_unlock();
+ if (usermode_locked)
+ usermodehelper_read_unlock();

out:
if (ret < 0) {
--
1.8.2.1

2013-05-08 15:42:20

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [PATCH 0/3] firmware: Fix usermodehelper deadlock at shutdown

On Wed, May 08, 2013 at 08:56:34AM +0200, Takashi Iwai wrote:
> Hi,
>
> this is a series of patches for the issue we faced in the firmware
> loader code during debugging the problem with dell_rbu driver with
> 3.9 kernel.
>
> The original problem was that the shutdown gets stuck when DELL BIOS
> update is performed. This turned out to be a problem in the firmware
> loader. Although the reason of dell_rbu driver breakage is still
> unclear, we should fix the firmware loader side, at least, not to
> stall during shutdown.

This is the second time this has been sent, right? Any differences from
the first set?

I'll get to these after 3.10-rc1 is out, thanks.

greg k-h

2013-05-08 15:48:01

by Takashi Iwai

[permalink] [raw]
Subject: Re: [PATCH 0/3] firmware: Fix usermodehelper deadlock at shutdown

At Wed, 8 May 2013 08:42:16 -0700,
Greg Kroah-Hartman wrote:
>
> On Wed, May 08, 2013 at 08:56:34AM +0200, Takashi Iwai wrote:
> > Hi,
> >
> > this is a series of patches for the issue we faced in the firmware
> > loader code during debugging the problem with dell_rbu driver with
> > 3.9 kernel.
> >
> > The original problem was that the shutdown gets stuck when DELL BIOS
> > update is performed. This turned out to be a problem in the firmware
> > loader. Although the reason of dell_rbu driver breakage is still
> > unclear, we should fix the firmware loader side, at least, not to
> > stall during shutdown.
>
> This is the second time this has been sent, right? Any differences from
> the first set?

Ah, sorry, you received it already.

I noticed that the patches via git-send-email weren't sent out
properly to some places (like LKML) because of my bad setup of email
(totally forgotten after the fresh installation), so I resent them
again yesterday.

I should have mentioned it in the cover page. Sorry for confusion.

> I'll get to these after 3.10-rc1 is out, thanks.

OK, thanks!


Takashi

2013-05-08 15:52:08

by Ming Lei

[permalink] [raw]
Subject: Re: [PATCH 1/3] firmware: Avoid superfluous usermodehelper lock

On Wed, May 8, 2013 at 2:56 PM, Takashi Iwai <[email protected]> wrote:
> When a firmware file can be loaded directly, there is no good reason
> to lock usermodehelper. It's needed only when the direct fw load
> fails and falls back to the user-mode helper.

I remembered that we discussed the problem before, :-)

Some crazy drivers might call request_firmware inside resume callback
(for example, USB devices might be rebind in resume), with
usermodehelper_read_lock, we can find the mistake easily and log it.

I am wondering if it is good to remove the usermodehelper lock.

Could you let us know any benefit to do it?

Thanks,
--
Ming Lei

2013-05-08 15:56:55

by Ming Lei

[permalink] [raw]
Subject: Re: [PATCH 2/3] firmware: Avoid deadlock of usermodehelper lock at shutdown

On Wed, May 8, 2013 at 2:56 PM, Takashi Iwai <[email protected]> wrote:
> When a system goes to reboot/shutdown, it tries to disable the
> usermode helper via usermodehelper_disable(). This might be blocked
> when a driver tries to load a firmware beforehand and it's stuck by
> some reason.

IMO, it is better to find why the loading is stuck. Also we already provides
the timeout sysfs file to help to deal with the situation.

>
> In this patch, the firmware class driver registers a reboot notifier
> so that it can abort all pending f/w bufs. Also enable a flag for
> avoiding the call of usermodehelper after the reboot/shutdown starts.

With this patch, maybe we only hide the real problem.

Thanks,
--
Ming Lei

2013-05-08 15:59:31

by Ming Lei

[permalink] [raw]
Subject: Re: [PATCH 0/3] firmware: Fix usermodehelper deadlock at shutdown

On Wed, May 8, 2013 at 11:42 PM, Greg Kroah-Hartman
<[email protected]> wrote:
>
> This is the second time this has been sent, right? Any differences from
> the first set?

Sorry to not see that.

> I'll get to these after 3.10-rc1 is out, thanks.

I think we need to discuss the patch further, see my comments on
patches, :-)

Thanks,
--
Ming Lei

2013-05-08 16:07:24

by Ming Lei

[permalink] [raw]
Subject: Re: [PATCH 0/3] firmware: Fix usermodehelper deadlock at shutdown

Hi Takashi,

Sorry for not CC list.

On Wed, May 8, 2013 at 2:56 PM, Takashi Iwai <[email protected]> wrote:
> Hi,
>
> this is a series of patches for the issue we faced in the firmware
> loader code during debugging the problem with dell_rbu driver with
> 3.9 kernel.
>
> The original problem was that the shutdown gets stuck when DELL BIOS
> update is performed. This turned out to be a problem in the firmware
> loader. Although the reason of dell_rbu driver breakage is still

Sorry, from these patchset, I can't see why it is a problem in firmware.

> unclear, we should fix the firmware loader side, at least, not to
> stall during shutdown.

Firstly you need to describe what/why is the stall? In fact, firmware
loading can't stall forever and it will timeout, but the current 60sec
timeout might be too long.

Thanks,
--
Ming Lei

2013-05-08 16:15:35

by Takashi Iwai

[permalink] [raw]
Subject: Re: [PATCH 2/3] firmware: Avoid deadlock of usermodehelper lock at shutdown

At Wed, 8 May 2013 23:56:51 +0800,
Ming Lei wrote:
>
> On Wed, May 8, 2013 at 2:56 PM, Takashi Iwai <[email protected]> wrote:
> > When a system goes to reboot/shutdown, it tries to disable the
> > usermode helper via usermodehelper_disable(). This might be blocked
> > when a driver tries to load a firmware beforehand and it's stuck by
> > some reason.
>
> IMO, it is better to find why the loading is stuck. Also we already provides
> the timeout sysfs file to help to deal with the situation.

The loading is done manually in the case of dell_rbu driver, so it may
happen at any time that the f/w loading doesn't finish properly.


> > In this patch, the firmware class driver registers a reboot notifier
> > so that it can abort all pending f/w bufs. Also enable a flag for
> > avoiding the call of usermodehelper after the reboot/shutdown starts.
>
> With this patch, maybe we only hide the real problem.

No, you can simulate the hang easily. Try the below (you can run it
on every x86 machine; it just loads the data onto the memory.)

- modprobe dell_rbu

- echo init > /sys/devices/platform/dell_rbu/image_type

(... now /sys/class/firmware/dell_rbu/* appear)

- echo 1 > /sys/class/firmware/dell_rbu/loading

- halt -p

Now the machine gets stuck.

Yes, the real problem is obvious: you didn't finish the f/w loading in
the above, and it will never happen. This doesn't justify that the
machine can be stalled at shutdown, though.


Takashi

2013-05-08 16:21:11

by Takashi Iwai

[permalink] [raw]
Subject: Re: [PATCH 1/3] firmware: Avoid superfluous usermodehelper lock

At Wed, 8 May 2013 23:52:02 +0800,
Ming Lei wrote:
>
> On Wed, May 8, 2013 at 2:56 PM, Takashi Iwai <[email protected]> wrote:
> > When a firmware file can be loaded directly, there is no good reason
> > to lock usermodehelper. It's needed only when the direct fw load
> > fails and falls back to the user-mode helper.
>
> I remembered that we discussed the problem before, :-)
>
> Some crazy drivers might call request_firmware inside resume callback
> (for example, USB devices might be rebind in resume), with
> usermodehelper_read_lock, we can find the mistake easily and log it.
>
> I am wondering if it is good to remove the usermodehelper lock.
>
> Could you let us know any benefit to do it?

Well, the question is whether usermodehelper lock is really an
appropriate stuff for *checking* the availability of direct fs
access. I find it doesn't fit well any longer, in the situation where
no actual user-space call is needed. Though, I'm not quite sure which
lock or flag can be used instead...


Takashi

2013-05-08 16:26:56

by Takashi Iwai

[permalink] [raw]
Subject: Re: [PATCH 0/3] firmware: Fix usermodehelper deadlock at shutdown

At Thu, 9 May 2013 00:07:17 +0800,
Ming Lei wrote:
>
> Hi Takashi,
>
> Sorry for not CC list.
>
> On Wed, May 8, 2013 at 2:56 PM, Takashi Iwai <[email protected]> wrote:
> > Hi,
> >
> > this is a series of patches for the issue we faced in the firmware
> > loader code during debugging the problem with dell_rbu driver with
> > 3.9 kernel.
> >
> > The original problem was that the shutdown gets stuck when DELL BIOS
> > update is performed. This turned out to be a problem in the firmware
> > loader. Although the reason of dell_rbu driver breakage is still
>
> Sorry, from these patchset, I can't see why it is a problem in firmware.
>
> > unclear, we should fix the firmware loader side, at least, not to
> > stall during shutdown.
>
> Firstly you need to describe what/why is the stall? In fact, firmware
> loading can't stall forever and it will timeout, but the current 60sec
> timeout might be too long.

The timeout check is activated only when uevent flag is set, and
dell_rbu driver doesn't set it explicitly (because it's not supposed
to be handled via udev or whatever).


Takashi

2013-05-08 16:37:17

by Takashi Iwai

[permalink] [raw]
Subject: Re: [PATCH 1/3] firmware: Avoid superfluous usermodehelper lock

At Wed, 08 May 2013 18:21:11 +0200,
Takashi Iwai wrote:
>
> At Wed, 8 May 2013 23:52:02 +0800,
> Ming Lei wrote:
> >
> > On Wed, May 8, 2013 at 2:56 PM, Takashi Iwai <[email protected]> wrote:
> > > When a firmware file can be loaded directly, there is no good reason
> > > to lock usermodehelper. It's needed only when the direct fw load
> > > fails and falls back to the user-mode helper.
> >
> > I remembered that we discussed the problem before, :-)
> >
> > Some crazy drivers might call request_firmware inside resume callback
> > (for example, USB devices might be rebind in resume), with
> > usermodehelper_read_lock, we can find the mistake easily and log it.
> >
> > I am wondering if it is good to remove the usermodehelper lock.
> >
> > Could you let us know any benefit to do it?
>
> Well, the question is whether usermodehelper lock is really an
> appropriate stuff for *checking* the availability of direct fs
> access. I find it doesn't fit well any longer, in the situation where
> no actual user-space call is needed. Though, I'm not quite sure which
> lock or flag can be used instead...

In other words, the first patch is no essential part of the fix.
I can revisit the second patch without this one and resend if
preferred.


Takashi

2013-05-08 17:51:30

by Takashi Iwai

[permalink] [raw]
Subject: Re: [PATCH 1/3] firmware: Avoid superfluous usermodehelper lock

At Wed, 08 May 2013 18:37:01 +0200,
Takashi Iwai wrote:
>
> At Wed, 08 May 2013 18:21:11 +0200,
> Takashi Iwai wrote:
> >
> > At Wed, 8 May 2013 23:52:02 +0800,
> > Ming Lei wrote:
> > >
> > > On Wed, May 8, 2013 at 2:56 PM, Takashi Iwai <[email protected]> wrote:
> > > > When a firmware file can be loaded directly, there is no good reason
> > > > to lock usermodehelper. It's needed only when the direct fw load
> > > > fails and falls back to the user-mode helper.
> > >
> > > I remembered that we discussed the problem before, :-)
> > >
> > > Some crazy drivers might call request_firmware inside resume callback
> > > (for example, USB devices might be rebind in resume), with
> > > usermodehelper_read_lock, we can find the mistake easily and log it.
> > >
> > > I am wondering if it is good to remove the usermodehelper lock.
> > >
> > > Could you let us know any benefit to do it?
> >
> > Well, the question is whether usermodehelper lock is really an
> > appropriate stuff for *checking* the availability of direct fs
> > access. I find it doesn't fit well any longer, in the situation where
> > no actual user-space call is needed. Though, I'm not quite sure which
> > lock or flag can be used instead...
>
> In other words, the first patch is no essential part of the fix.
> I can revisit the second patch without this one and resend if
> preferred.

FWIW, below is the revised patch.
It's alone without the patch 1 in the previous series.


Takashi

---
From: Takashi Iwai <[email protected]>
Subject: [PATCH v2] firmware: Avoid deadlock of usermodehelper lock at shutdown

When a system goes to reboot/shutdown, it tries to disable the
usermode helper via usermodehelper_disable(). This might be blocked
when a driver tries to load a firmware beforehand and it's stuck by
some reason.

In this patch, the firmware class driver registers a reboot notifier
so that it can abort all pending f/w bufs. Also enable a flag for
avoiding the call of usermodehelper after the reboot/shutdown starts.

Signed-off-by: Takashi Iwai <[email protected]>
---
drivers/base/firmware_class.c | 40 +++++++++++++++++++++++++++++++++-------
1 file changed, 33 insertions(+), 7 deletions(-)

diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c
index 4b1f926..972e535 100644
--- a/drivers/base/firmware_class.c
+++ b/drivers/base/firmware_class.c
@@ -27,6 +27,7 @@
#include <linux/pm.h>
#include <linux/suspend.h>
#include <linux/syscore_ops.h>
+#include <linux/reboot.h>

#include <generated/utsrelease.h>

@@ -171,6 +172,7 @@ static struct firmware_buf *__allocate_fw_buf(const char *fw_name,
strcpy(buf->fw_id, fw_name);
buf->fwc = fwc;
init_completion(&buf->completion);
+ INIT_LIST_HEAD(&buf->list);

pr_debug("%s: fw-%s buf=%p\n", __func__, fw_name, buf);

@@ -446,10 +448,9 @@ static struct firmware_priv *to_firmware_priv(struct device *dev)
return container_of(dev, struct firmware_priv, dev);
}

-static void fw_load_abort(struct firmware_priv *fw_priv)
+static void fw_load_abort(struct firmware_buf *buf)
{
- struct firmware_buf *buf = fw_priv->buf;
-
+ list_del_init(&buf->list);
set_bit(FW_STATUS_ABORT, &buf->status);
complete_all(&buf->completion);
}
@@ -457,6 +458,24 @@ static void fw_load_abort(struct firmware_priv *fw_priv)
#define is_fw_load_aborted(buf) \
test_bit(FW_STATUS_ABORT, &(buf)->status)

+static LIST_HEAD(pending_fw_head);
+
+/* reboot notifier for avoid deadlock with usermode_lock */
+static int fw_shutdown_notify(struct notifier_block *unused1,
+ unsigned long unused2, void *unused3)
+{
+ mutex_lock(&fw_lock);
+ while (!list_empty(&pending_fw_head))
+ fw_load_abort(list_first_entry(&pending_fw_head,
+ struct firmware_buf, list));
+ mutex_unlock(&fw_lock);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block fw_shutdown_nb = {
+ .notifier_call = fw_shutdown_notify,
+};
+
static ssize_t firmware_timeout_show(struct class *class,
struct class_attribute *attr,
char *buf)
@@ -604,6 +623,7 @@ static ssize_t firmware_loading_store(struct device *dev,
* is completed.
* */
fw_map_pages_buf(fw_buf);
+ list_del_init(&fw_buf->list);
complete_all(&fw_buf->completion);
break;
}
@@ -612,7 +632,7 @@ static ssize_t firmware_loading_store(struct device *dev,
dev_err(dev, "%s: unexpected value (%d)\n", __func__, loading);
/* fallthrough */
case -1:
- fw_load_abort(fw_priv);
+ fw_load_abort(fw_buf);
break;
}
out:
@@ -680,7 +700,7 @@ static int fw_realloc_buffer(struct firmware_priv *fw_priv, int min_size)
new_pages = kmalloc(new_array_size * sizeof(void *),
GFP_KERNEL);
if (!new_pages) {
- fw_load_abort(fw_priv);
+ fw_load_abort(buf);
return -ENOMEM;
}
memcpy(new_pages, buf->pages,
@@ -697,7 +717,7 @@ static int fw_realloc_buffer(struct firmware_priv *fw_priv, int min_size)
alloc_page(GFP_KERNEL | __GFP_HIGHMEM);

if (!buf->pages[buf->nr_pages]) {
- fw_load_abort(fw_priv);
+ fw_load_abort(buf);
return -ENOMEM;
}
buf->nr_pages++;
@@ -781,7 +801,7 @@ static void firmware_class_timeout_work(struct work_struct *work)
mutex_unlock(&fw_lock);
return;
}
- fw_load_abort(fw_priv);
+ fw_load_abort(fw_priv->buf);
mutex_unlock(&fw_lock);
}

@@ -857,6 +877,10 @@ static int _request_firmware_load(struct firmware_priv *fw_priv, bool uevent,
kobject_uevent(&fw_priv->dev.kobj, KOBJ_ADD);
}

+ mutex_lock(&fw_lock);
+ list_add(&buf->list, &pending_fw_head);
+ mutex_unlock(&fw_lock);
+
wait_for_completion(&buf->completion);

cancel_delayed_work_sync(&fw_priv->timeout_work);
@@ -1517,6 +1541,7 @@ static int __init firmware_class_init(void)
{
fw_cache_init();
#ifdef CONFIG_FW_LOADER_USER_HELPER
+ register_reboot_notifier(&fw_shutdown_nb);
return class_register(&firmware_class);
#else
return 0;
@@ -1530,6 +1555,7 @@ static void __exit firmware_class_exit(void)
unregister_pm_notifier(&fw_cache.pm_notify);
#endif
#ifdef CONFIG_FW_LOADER_USER_HELPER
+ unregister_reboot_notifier(&fw_shutdown_nb);
class_unregister(&firmware_class);
#endif
}
--
1.8.2.1

2013-05-08 18:47:09

by Kay Sievers

[permalink] [raw]
Subject: Re: [PATCH 0/3] firmware: Fix usermodehelper deadlock at shutdown

On Wed, May 8, 2013 at 6:26 PM, Takashi Iwai <[email protected]> wrote:
> At Thu, 9 May 2013 00:07:17 +0800,
> Ming Lei wrote:

>> On Wed, May 8, 2013 at 2:56 PM, Takashi Iwai <[email protected]> wrote:
>> > Hi,
>> >
>> > this is a series of patches for the issue we faced in the firmware
>> > loader code during debugging the problem with dell_rbu driver with
>> > 3.9 kernel.
>> >
>> > The original problem was that the shutdown gets stuck when DELL BIOS
>> > update is performed. This turned out to be a problem in the firmware
>> > loader. Although the reason of dell_rbu driver breakage is still
>>
>> Sorry, from these patchset, I can't see why it is a problem in firmware.
>>
>> > unclear, we should fix the firmware loader side, at least, not to
>> > stall during shutdown.
>>
>> Firstly you need to describe what/why is the stall? In fact, firmware
>> loading can't stall forever and it will timeout, but the current 60sec
>> timeout might be too long.
>
> The timeout check is activated only when uevent flag is set, and
> dell_rbu driver doesn't set it explicitly (because it's not supposed
> to be handled via udev or whatever).

These use the firmware loader not in the way the interface was intended:
drivers/misc/lattice-ecp3-config.c
drivers/firmware/dell_rbu.c

They just use the mechanism without any of the usual userspace setup.
It's really a nasty hack to hijack the interface that way.

The commit 6e3eaab02028c4087a92711b20abb9e72cc803a7 is a pretty broken
idea to start with. If something triggers uevents during runtime which
is not uncommon, these on-demand silently created firmware devices
would get really confused and race against the udev firmware loader
which cancels the events.

As if the userspace firmware loading in general wasn't a bad enough
idea already. :)

Kay

2013-05-09 01:19:56

by Ming Lei

[permalink] [raw]
Subject: Re: [PATCH 2/3] firmware: Avoid deadlock of usermodehelper lock at shutdown

On Thu, May 9, 2013 at 12:15 AM, Takashi Iwai <[email protected]> wrote:
> At Wed, 8 May 2013 23:56:51 +0800,
> Ming Lei wrote:
>>
>> On Wed, May 8, 2013 at 2:56 PM, Takashi Iwai <[email protected]> wrote:
>> > When a system goes to reboot/shutdown, it tries to disable the
>> > usermode helper via usermodehelper_disable(). This might be blocked
>> > when a driver tries to load a firmware beforehand and it's stuck by
>> > some reason.
>>
>> IMO, it is better to find why the loading is stuck. Also we already provides
>> the timeout sysfs file to help to deal with the situation.
>
> The loading is done manually in the case of dell_rbu driver, so it may
> happen at any time that the f/w loading doesn't finish properly.
>
>
>> > In this patch, the firmware class driver registers a reboot notifier
>> > so that it can abort all pending f/w bufs. Also enable a flag for
>> > avoiding the call of usermodehelper after the reboot/shutdown starts.
>>
>> With this patch, maybe we only hide the real problem.
>
> No, you can simulate the hang easily. Try the below (you can run it
> on every x86 machine; it just loads the data onto the memory.)
>
> - modprobe dell_rbu
>
> - echo init > /sys/devices/platform/dell_rbu/image_type
>
> (... now /sys/class/firmware/dell_rbu/* appear)
>
> - echo 1 > /sys/class/firmware/dell_rbu/loading
>
> - halt -p
>
> Now the machine gets stuck.
>
> Yes, the real problem is obvious: you didn't finish the f/w loading in
> the above, and it will never happen. This doesn't justify that the
> machine can be stalled at shutdown, though.

OK, got it, thanks for your explanation, but anyway the stall is caused
by not concluding the load in userspace(we have document about it),
not by firmware loader itself.

I think there is no reason to disable the loading timeout for !event, could
you test the below patch to see if it may avoid the stall?

diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c
index 4b1f926..caf399e 100644
--- a/drivers/base/firmware_class.c
+++ b/drivers/base/firmware_class.c
@@ -848,11 +848,12 @@ static int _request_firmware_load(struct
firmware_priv *fw_priv, bool uevent,
goto err_del_bin_attr;
}

+ if (timeout != MAX_SCHEDULE_TIMEOUT)
+ schedule_delayed_work(&fw_priv->timeout_work, timeout);
+
if (uevent) {
dev_set_uevent_suppress(f_dev, false);
dev_dbg(f_dev, "firmware: requesting %s\n", buf->fw_id);
- if (timeout != MAX_SCHEDULE_TIMEOUT)
- schedule_delayed_work(&fw_priv->timeout_work, timeout);

kobject_uevent(&fw_priv->dev.kobj, KOBJ_ADD);
}


Thanks,
--
Ming Lei

2013-05-09 01:25:39

by Ming Lei

[permalink] [raw]
Subject: Re: [PATCH 1/3] firmware: Avoid superfluous usermodehelper lock

On Thu, May 9, 2013 at 1:51 AM, Takashi Iwai <[email protected]> wrote:
>> In other words, the first patch is no essential part of the fix.
>> I can revisit the second patch without this one and resend if
>> preferred.
>
> FWIW, below is the revised patch.
> It's alone without the patch 1 in the previous series.

The root cause is that your user space loader doesn't follow the
current firmware loader interface.

IMO, the patch is unnecessary since we already have the timeout
abort(just need one patch to enable it for nowait api)

Thanks,
--
Ming Lei

2013-05-09 07:26:10

by Takashi Iwai

[permalink] [raw]
Subject: Re: [PATCH 0/3] firmware: Fix usermodehelper deadlock at shutdown

At Wed, 8 May 2013 20:46:43 +0200,
Kay Sievers wrote:
>
> On Wed, May 8, 2013 at 6:26 PM, Takashi Iwai <[email protected]> wrote:
> > At Thu, 9 May 2013 00:07:17 +0800,
> > Ming Lei wrote:
>
> >> On Wed, May 8, 2013 at 2:56 PM, Takashi Iwai <[email protected]> wrote:
> >> > Hi,
> >> >
> >> > this is a series of patches for the issue we faced in the firmware
> >> > loader code during debugging the problem with dell_rbu driver with
> >> > 3.9 kernel.
> >> >
> >> > The original problem was that the shutdown gets stuck when DELL BIOS
> >> > update is performed. This turned out to be a problem in the firmware
> >> > loader. Although the reason of dell_rbu driver breakage is still
> >>
> >> Sorry, from these patchset, I can't see why it is a problem in firmware.
> >>
> >> > unclear, we should fix the firmware loader side, at least, not to
> >> > stall during shutdown.
> >>
> >> Firstly you need to describe what/why is the stall? In fact, firmware
> >> loading can't stall forever and it will timeout, but the current 60sec
> >> timeout might be too long.
> >
> > The timeout check is activated only when uevent flag is set, and
> > dell_rbu driver doesn't set it explicitly (because it's not supposed
> > to be handled via udev or whatever).
>
> These use the firmware loader not in the way the interface was intended:
> drivers/misc/lattice-ecp3-config.c
> drivers/firmware/dell_rbu.c
>
> They just use the mechanism without any of the usual userspace setup.
> It's really a nasty hack to hijack the interface that way.
>
> The commit 6e3eaab02028c4087a92711b20abb9e72cc803a7 is a pretty broken
> idea to start with. If something triggers uevents during runtime which
> is not uncommon, these on-demand silently created firmware devices
> would get really confused and race against the udev firmware loader
> which cancels the events.
>
> As if the userspace firmware loading in general wasn't a bad enough
> idea already. :)

Luckily there are only few users, so conversion of such known drivers
should be easy. But the major question is whether we are allowed to
drop the non-hotplug f/w loading right now after years. It means a
clear user-space breakage, and we don't want it as much as possible.


Takashi

2013-05-09 07:31:16

by Takashi Iwai

[permalink] [raw]
Subject: Re: [PATCH 1/3] firmware: Avoid superfluous usermodehelper lock

At Thu, 9 May 2013 09:25:35 +0800,
Ming Lei wrote:
>
> On Thu, May 9, 2013 at 1:51 AM, Takashi Iwai <[email protected]> wrote:
> >> In other words, the first patch is no essential part of the fix.
> >> I can revisit the second patch without this one and resend if
> >> preferred.
> >
> > FWIW, below is the revised patch.
> > It's alone without the patch 1 in the previous series.
>
> The root cause is that your user space loader doesn't follow the
> current firmware loader interface.

There is not necessarily a user space "loader". It's declared as
non-hotplug, thus it can be a manual operation by human.

> IMO, the patch is unnecessary since we already have the timeout
> abort(just need one patch to enable it for nowait api)

Well, you cannot know any sane value for such human's operation.
If it's a system response, then we can assume something. But the
invocation of request_firmware_nowait() with non-hotplug means that
you can never know the actual use case, thus you cannot know any sane
timeout, too.

And secondly, I don't think it's good to rely on the timeout. Why
does the system have to wait for minute for shutdown? The system is
in shutdown, and it's triggered by user. It's more natural to abort
the pending f/w loading because you don't want to handle it any
longer after the system shuts down.

I guess the same problem remains in the suspend/hibernation.


thanks,

Takashi

2013-05-09 07:34:28

by Takashi Iwai

[permalink] [raw]
Subject: Re: [PATCH 2/3] firmware: Avoid deadlock of usermodehelper lock at shutdown

At Thu, 9 May 2013 09:19:47 +0800,
Ming Lei wrote:
>
> On Thu, May 9, 2013 at 12:15 AM, Takashi Iwai <[email protected]> wrote:
> > At Wed, 8 May 2013 23:56:51 +0800,
> > Ming Lei wrote:
> >>
> >> On Wed, May 8, 2013 at 2:56 PM, Takashi Iwai <[email protected]> wrote:
> >> > When a system goes to reboot/shutdown, it tries to disable the
> >> > usermode helper via usermodehelper_disable(). This might be blocked
> >> > when a driver tries to load a firmware beforehand and it's stuck by
> >> > some reason.
> >>
> >> IMO, it is better to find why the loading is stuck. Also we already provides
> >> the timeout sysfs file to help to deal with the situation.
> >
> > The loading is done manually in the case of dell_rbu driver, so it may
> > happen at any time that the f/w loading doesn't finish properly.
> >
> >
> >> > In this patch, the firmware class driver registers a reboot notifier
> >> > so that it can abort all pending f/w bufs. Also enable a flag for
> >> > avoiding the call of usermodehelper after the reboot/shutdown starts.
> >>
> >> With this patch, maybe we only hide the real problem.
> >
> > No, you can simulate the hang easily. Try the below (you can run it
> > on every x86 machine; it just loads the data onto the memory.)
> >
> > - modprobe dell_rbu
> >
> > - echo init > /sys/devices/platform/dell_rbu/image_type
> >
> > (... now /sys/class/firmware/dell_rbu/* appear)
> >
> > - echo 1 > /sys/class/firmware/dell_rbu/loading
> >
> > - halt -p
> >
> > Now the machine gets stuck.
> >
> > Yes, the real problem is obvious: you didn't finish the f/w loading in
> > the above, and it will never happen. This doesn't justify that the
> > machine can be stalled at shutdown, though.
>
> OK, got it, thanks for your explanation, but anyway the stall is caused
> by not concluding the load in userspace(we have document about it),
> not by firmware loader itself.

It's the firmware loader who blocks the usermodelock :)

> I think there is no reason to disable the loading timeout for !event, could
> you test the below patch to see if it may avoid the stall?

Please test on your machine (e.g. on VM). I'm off today for a
holiday.
As mentioned, dell_rbu can run on every machine and it's harmless.


thanks,

Takashi

> diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c
> index 4b1f926..caf399e 100644
> --- a/drivers/base/firmware_class.c
> +++ b/drivers/base/firmware_class.c
> @@ -848,11 +848,12 @@ static int _request_firmware_load(struct
> firmware_priv *fw_priv, bool uevent,
> goto err_del_bin_attr;
> }
>
> + if (timeout != MAX_SCHEDULE_TIMEOUT)
> + schedule_delayed_work(&fw_priv->timeout_work, timeout);
> +
> if (uevent) {
> dev_set_uevent_suppress(f_dev, false);
> dev_dbg(f_dev, "firmware: requesting %s\n", buf->fw_id);
> - if (timeout != MAX_SCHEDULE_TIMEOUT)
> - schedule_delayed_work(&fw_priv->timeout_work, timeout);
>
> kobject_uevent(&fw_priv->dev.kobj, KOBJ_ADD);
> }
>
>
> Thanks,
> --
> Ming Lei
>

2013-05-09 08:43:34

by Ming Lei

[permalink] [raw]
Subject: Re: [PATCH 1/3] firmware: Avoid superfluous usermodehelper lock

On Thu, May 9, 2013 at 3:31 PM, Takashi Iwai <[email protected]> wrote:
> At Thu, 9 May 2013 09:25:35 +0800,
> Ming Lei wrote:
>>
>> On Thu, May 9, 2013 at 1:51 AM, Takashi Iwai <[email protected]> wrote:
>> >> In other words, the first patch is no essential part of the fix.
>> >> I can revisit the second patch without this one and resend if
>> >> preferred.
>> >
>> > FWIW, below is the revised patch.
>> > It's alone without the patch 1 in the previous series.
>>
>> The root cause is that your user space loader doesn't follow the
>> current firmware loader interface.
>
> There is not necessarily a user space "loader". It's declared as
> non-hotplug, thus it can be a manual operation by human.
>
>> IMO, the patch is unnecessary since we already have the timeout
>> abort(just need one patch to enable it for nowait api)
>
> Well, you cannot know any sane value for such human's operation.
> If it's a system response, then we can assume something. But the
> invocation of request_firmware_nowait() with non-hotplug means that
> you can never know the actual use case, thus you cannot know any sane

I think the use case should be driver specific, and the loading is triggered
from user space in dell_rbu(write sysfs file to trigger BIOS update), so the
user has been ready for loading the image.

For another usage(lattice-ecp3-config.c), it is merged recently and very
specific(maybe only for personal use), and can be easily to change to
trigger loading from user space like dell.

I think both the two usages choose FW_ACTION_NOHOTPLUG
because they have out of tree firmware images. So looks enabling
timeout won't be a big deal for them.

> timeout, too.
>
> And secondly, I don't think it's good to rely on the timeout. Why
> does the system have to wait for minute for shutdown? The system is

It won't if the user space follows the rules.

> in shutdown, and it's triggered by user. It's more natural to abort
> the pending f/w loading because you don't want to handle it any
> longer after the system shuts down.

There is still risk to force killing the loader before shutdown or
suspend. Maybe some devices depend its firmware in shutdown
or suspend callback to configure power setting.

Thanks,
--
Ming Lei

2013-05-09 17:04:39

by Takashi Iwai

[permalink] [raw]
Subject: Re: [PATCH 1/3] firmware: Avoid superfluous usermodehelper lock

At Thu, 9 May 2013 16:43:28 +0800,
Ming Lei wrote:
>
> On Thu, May 9, 2013 at 3:31 PM, Takashi Iwai <[email protected]> wrote:
> > At Thu, 9 May 2013 09:25:35 +0800,
> > Ming Lei wrote:
> >>
> >> On Thu, May 9, 2013 at 1:51 AM, Takashi Iwai <[email protected]> wrote:
> >> >> In other words, the first patch is no essential part of the fix.
> >> >> I can revisit the second patch without this one and resend if
> >> >> preferred.
> >> >
> >> > FWIW, below is the revised patch.
> >> > It's alone without the patch 1 in the previous series.
> >>
> >> The root cause is that your user space loader doesn't follow the
> >> current firmware loader interface.
> >
> > There is not necessarily a user space "loader". It's declared as
> > non-hotplug, thus it can be a manual operation by human.
> >
> >> IMO, the patch is unnecessary since we already have the timeout
> >> abort(just need one patch to enable it for nowait api)
> >
> > Well, you cannot know any sane value for such human's operation.
> > If it's a system response, then we can assume something. But the
> > invocation of request_firmware_nowait() with non-hotplug means that
> > you can never know the actual use case, thus you cannot know any sane
>
> I think the use case should be driver specific, and the loading is triggered
> from user space in dell_rbu(write sysfs file to trigger BIOS update), so the
> user has been ready for loading the image.

Yes, it's ready, but it doesn't guarantee that it's done in which time
limit. That's the uncertain point in such an interface.

If it were hotplug via udev, we can assume some sane time limit.
But in a scenario like the above, with the manual intervention, we
can't know what is the sane value in a single manner.

> For another usage(lattice-ecp3-config.c), it is merged recently and very
> specific(maybe only for personal use), and can be easily to change to
> trigger loading from user space like dell.
>
> I think both the two usages choose FW_ACTION_NOHOTPLUG
> because they have out of tree firmware images. So looks enabling
> timeout won't be a big deal for them.

Yeah, but it's a guess.

So, in these use cases, the practical impact would be small, I agree.
Changing this (adding a timeout unconditionally) eases the shutdown
stall. But I still feel this is no essential fix, and in general,
changing the user-space behavior has to be done really carefully.

> > timeout, too.
> >
> > And secondly, I don't think it's good to rely on the timeout. Why
> > does the system have to wait for minute for shutdown? The system is
>
> It won't if the user space follows the rules.

Well... what rule? The kernel shutdown must be blocked when user
space doesn't write 0 or -1 to /sys/class/firmware/*/loading?
If you meant it, I would say that the rule is wrong. There is no big
reason to block the *kernel* shutdown by such an action.

We're handling the moment where the system should be really shut down,
already after all user-space things are synced and killed. For
example, would we delay the shutdown until all opened files are
closed even at this point? The kernel doesn't do so.

> > in shutdown, and it's triggered by user. It's more natural to abort
> > the pending f/w loading because you don't want to handle it any
> > longer after the system shuts down.
>
> There is still risk to force killing the loader before shutdown or
> suspend. Maybe some devices depend its firmware in shutdown
> or suspend callback to configure power setting.

The firmware loading is never guaranteed to succeed. If the abort of
f/w loading at shutdown would cause any problem, it means that the
driver is fundamentally buggy, and we must fix it inevitably anyway.

Of course, the suspend is a bit different issue. Maybe better to
retry the load after resume instead of forcibly aborting.
My point is that, if such critical kernel behavior like suspend or
shutdown rely on a timeout of user actions, it's badly designed.


thanks,

Takashi

2013-05-10 01:25:56

by Ming Lei

[permalink] [raw]
Subject: Re: [PATCH 1/3] firmware: Avoid superfluous usermodehelper lock

On Fri, May 10, 2013 at 1:04 AM, Takashi Iwai <[email protected]> wrote:
> At Thu, 9 May 2013 16:43:28 +0800,
> Ming Lei wrote:
>>
>> On Thu, May 9, 2013 at 3:31 PM, Takashi Iwai <[email protected]> wrote:
>> > At Thu, 9 May 2013 09:25:35 +0800,
>> > Ming Lei wrote:
>> >>
>> >> On Thu, May 9, 2013 at 1:51 AM, Takashi Iwai <[email protected]> wrote:
>> >> >> In other words, the first patch is no essential part of the fix.
>> >> >> I can revisit the second patch without this one and resend if
>> >> >> preferred.
>> >> >
>> >> > FWIW, below is the revised patch.
>> >> > It's alone without the patch 1 in the previous series.
>> >>
>> >> The root cause is that your user space loader doesn't follow the
>> >> current firmware loader interface.
>> >
>> > There is not necessarily a user space "loader". It's declared as
>> > non-hotplug, thus it can be a manual operation by human.
>> >
>> >> IMO, the patch is unnecessary since we already have the timeout
>> >> abort(just need one patch to enable it for nowait api)
>> >
>> > Well, you cannot know any sane value for such human's operation.
>> > If it's a system response, then we can assume something. But the
>> > invocation of request_firmware_nowait() with non-hotplug means that
>> > you can never know the actual use case, thus you cannot know any sane
>>
>> I think the use case should be driver specific, and the loading is triggered
>> from user space in dell_rbu(write sysfs file to trigger BIOS update), so the
>> user has been ready for loading the image.
>
> Yes, it's ready, but it doesn't guarantee that it's done in which time
> limit. That's the uncertain point in such an interface.
>
> If it were hotplug via udev, we can assume some sane time limit.
> But in a scenario like the above, with the manual intervention, we
> can't know what is the sane value in a single manner.
>
>> For another usage(lattice-ecp3-config.c), it is merged recently and very
>> specific(maybe only for personal use), and can be easily to change to
>> trigger loading from user space like dell.
>>
>> I think both the two usages choose FW_ACTION_NOHOTPLUG
>> because they have out of tree firmware images. So looks enabling
>> timeout won't be a big deal for them.
>
> Yeah, but it's a guess.
>
> So, in these use cases, the practical impact would be small, I agree.
> Changing this (adding a timeout unconditionally) eases the shutdown
> stall. But I still feel this is no essential fix, and in general,
> changing the user-space behavior has to be done really carefully.

If there are lots of drivers which are using FW_ACTION_NOHOTPLUG,
I agree with you. But the fact is that only two, and both uses firmware
images which aren't shipped by distribution, so it isn't a big deal, is it?

>
>> > timeout, too.
>> >
>> > And secondly, I don't think it's good to rely on the timeout. Why
>> > does the system have to wait for minute for shutdown? The system is
>>
>> It won't if the user space follows the rules.
>
> Well... what rule? The kernel shutdown must be blocked when user
> space doesn't write 0 or -1 to /sys/class/firmware/*/loading?

Yes.

> If you meant it, I would say that the rule is wrong. There is no big

Why? It has been here for ten years.

> reason to block the *kernel* shutdown by such an action.

The delay just shows that the user doesn't follow the rule.

>
> We're handling the moment where the system should be really shut down,
> already after all user-space things are synced and killed. For
> example, would we delay the shutdown until all opened files are
> closed even at this point? The kernel doesn't do so.

shutdown isn't unconditional, for example, page cache must be flushed
to disk.

>
>> > in shutdown, and it's triggered by user. It's more natural to abort
>> > the pending f/w loading because you don't want to handle it any
>> > longer after the system shuts down.
>>
>> There is still risk to force killing the loader before shutdown or
>> suspend. Maybe some devices depend its firmware in shutdown
>> or suspend callback to configure power setting.
>
> The firmware loading is never guaranteed to succeed. If the abort of

Yes, but it is guaranteed that it will be ended in some time, and your
problem is nothing to do with success, just about timing.

> f/w loading at shutdown would cause any problem, it means that the
> driver is fundamentally buggy, and we must fix it inevitably anyway.
>
> Of course, the suspend is a bit different issue. Maybe better to
> retry the load after resume instead of forcibly aborting.
> My point is that, if such critical kernel behavior like suspend or
> shutdown rely on a timeout of user actions, it's badly designed.

Anyway, if you want to force killing loader, please only kill these
FW_ACTION_NOHOTPLUG before suspend and reboot, and do
not touch FW_ACTION_HOTPLUG. Is it OK for you?

Thanks,
--
Ming Lei

2013-05-10 09:32:24

by Takashi Iwai

[permalink] [raw]
Subject: Re: [PATCH 1/3] firmware: Avoid superfluous usermodehelper lock

At Fri, 10 May 2013 09:25:51 +0800,
Ming Lei wrote:
>
> On Fri, May 10, 2013 at 1:04 AM, Takashi Iwai <[email protected]> wrote:
> > At Thu, 9 May 2013 16:43:28 +0800,
> > Ming Lei wrote:
> >>
> >> On Thu, May 9, 2013 at 3:31 PM, Takashi Iwai <[email protected]> wrote:
> >> > At Thu, 9 May 2013 09:25:35 +0800,
> >> > Ming Lei wrote:
> >> >>
> >> >> On Thu, May 9, 2013 at 1:51 AM, Takashi Iwai <[email protected]> wrote:
> >> >> >> In other words, the first patch is no essential part of the fix.
> >> >> >> I can revisit the second patch without this one and resend if
> >> >> >> preferred.
> >> >> >
> >> >> > FWIW, below is the revised patch.
> >> >> > It's alone without the patch 1 in the previous series.
> >> >>
> >> >> The root cause is that your user space loader doesn't follow the
> >> >> current firmware loader interface.
> >> >
> >> > There is not necessarily a user space "loader". It's declared as
> >> > non-hotplug, thus it can be a manual operation by human.
> >> >
> >> >> IMO, the patch is unnecessary since we already have the timeout
> >> >> abort(just need one patch to enable it for nowait api)
> >> >
> >> > Well, you cannot know any sane value for such human's operation.
> >> > If it's a system response, then we can assume something. But the
> >> > invocation of request_firmware_nowait() with non-hotplug means that
> >> > you can never know the actual use case, thus you cannot know any sane
> >>
> >> I think the use case should be driver specific, and the loading is triggered
> >> from user space in dell_rbu(write sysfs file to trigger BIOS update), so the
> >> user has been ready for loading the image.
> >
> > Yes, it's ready, but it doesn't guarantee that it's done in which time
> > limit. That's the uncertain point in such an interface.
> >
> > If it were hotplug via udev, we can assume some sane time limit.
> > But in a scenario like the above, with the manual intervention, we
> > can't know what is the sane value in a single manner.
> >
> >> For another usage(lattice-ecp3-config.c), it is merged recently and very
> >> specific(maybe only for personal use), and can be easily to change to
> >> trigger loading from user space like dell.
> >>
> >> I think both the two usages choose FW_ACTION_NOHOTPLUG
> >> because they have out of tree firmware images. So looks enabling
> >> timeout won't be a big deal for them.
> >
> > Yeah, but it's a guess.
> >
> > So, in these use cases, the practical impact would be small, I agree.
> > Changing this (adding a timeout unconditionally) eases the shutdown
> > stall. But I still feel this is no essential fix, and in general,
> > changing the user-space behavior has to be done really carefully.
>
> If there are lots of drivers which are using FW_ACTION_NOHOTPLUG,
> I agree with you. But the fact is that only two, and both uses firmware
> images which aren't shipped by distribution, so it isn't a big deal, is it?

Yeah, this particular thing could be eased by the timeout, but it's
no real "fix". The blocking behavior is just a non-sense there.

> >> > timeout, too.
> >> >
> >> > And secondly, I don't think it's good to rely on the timeout. Why
> >> > does the system have to wait for minute for shutdown? The system is
> >>
> >> It won't if the user space follows the rules.
> >
> > Well... what rule? The kernel shutdown must be blocked when user
> > space doesn't write 0 or -1 to /sys/class/firmware/*/loading?
>
> Yes.
>
> > If you meant it, I would say that the rule is wrong. There is no big
>
> Why? It has been here for ten years.

No. This is no old behavior (in the sense of enterprise products :)
It was introduced since 3.3 or 3.4 kernel, and this bug actually hits
our customer now. That's why I had to work on this nasty thing...
Before 3.3/3.4, there was no blocking behavior at shutdown.

And I bet that this problem was simply overseen at that time. The fix
at that time was basically for the race at suspend/resume.

> > reason to block the *kernel* shutdown by such an action.
>
> The delay just shows that the user doesn't follow the rule.

Again, the rule is just wrong.

> > We're handling the moment where the system should be really shut down,
> > already after all user-space things are synced and killed. For
> > example, would we delay the shutdown until all opened files are
> > closed even at this point? The kernel doesn't do so.
>
> shutdown isn't unconditional, for example, page cache must be flushed
> to disk.

Your example doesn't fit really. "Can user-space block/delay the
suspend at the point where kernel_restart() or kernel_halt() is
called?" That's the question. The page cache flush before shutdown is
a pure kernel action, and there is no user-space involvement to
block/delay.

> >> > in shutdown, and it's triggered by user. It's more natural to abort
> >> > the pending f/w loading because you don't want to handle it any
> >> > longer after the system shuts down.
> >>
> >> There is still risk to force killing the loader before shutdown or
> >> suspend. Maybe some devices depend its firmware in shutdown
> >> or suspend callback to configure power setting.
> >
> > The firmware loading is never guaranteed to succeed. If the abort of
>
> Yes, but it is guaranteed that it will be ended in some time, and your
> problem is nothing to do with success, just about timing.

There is no need for wait at that point, and this is a clear
regression in comparison with the old kernels.

> > f/w loading at shutdown would cause any problem, it means that the
> > driver is fundamentally buggy, and we must fix it inevitably anyway.
> >
> > Of course, the suspend is a bit different issue. Maybe better to
> > retry the load after resume instead of forcibly aborting.
> > My point is that, if such critical kernel behavior like suspend or
> > shutdown rely on a timeout of user actions, it's badly designed.
>
> Anyway, if you want to force killing loader, please only kill these
> FW_ACTION_NOHOTPLUG before suspend and reboot, and do
> not touch FW_ACTION_HOTPLUG. Is it OK for you?

Note that, as with my patch, only the shutdown case is handled. Let's
not mixing up suspend and shutdown behavior for now.

I see no reason why we need to wait at shutdown even for
FW_ACTION_HOTPLUG. At that point, there should be no longer
user-space action. It means that the driver shouldn't get any more
data to finish the f/w loading upon that point. Thus the only
possible consequence is the timeout, which is equivalent with the
immediate abort of the operation.

As mentioned earlier, the suspend behavior may be different. We want
to retry the f/w load. Ideally, the f/w loader should abort and
automatically retry after resume. In this case, also there is no big
reason to distinguish FW_ACTION_* types. Even for udev case, the
action can be easily retried.

Or, a cleaner solution would be to get rid of FW_ACTION_NOHOTPLUG
completely. As Kay mentioned, this was a big mistake from the very
beginning. Fortunately, there are little users, so if we are allowed
to change the interface, it'd be relatively easy to drop this mode.
And the rest cases are usually covered by the direct f/w loading.


thanks,

Takashi

2013-05-11 13:01:32

by Ming Lei

[permalink] [raw]
Subject: Re: [PATCH 1/3] firmware: Avoid superfluous usermodehelper lock

On Fri, May 10, 2013 at 5:32 PM, Takashi Iwai <[email protected]> wrote:
> At Fri, 10 May 2013 09:25:51 +0800,
>>
>> Anyway, if you want to force killing loader, please only kill these
>> FW_ACTION_NOHOTPLUG before suspend and reboot, and do
>> not touch FW_ACTION_HOTPLUG. Is it OK for you?
>
> Note that, as with my patch, only the shutdown case is handled. Let's
> not mixing up suspend and shutdown behavior for now.
>
> I see no reason why we need to wait at shutdown even for
> FW_ACTION_HOTPLUG. At that point, there should be no longer
> user-space action. It means that the driver shouldn't get any more
> data to finish the f/w loading upon that point. Thus the only

For example, when one ethernet driver is requesting its firmware,
the loader is forced to be killed before shutdown, so the driver can't set the
WoL any more in its shutdown(), then users start to complain it is a
regression.

That is why I suggest you to only kill requests of FW_ACTION_NOHOTPLUG.

Also it isn't good to affect all drivers just for fixing two insane drivers.

> possible consequence is the timeout, which is equivalent with the
> immediate abort of the operation.
>
> As mentioned earlier, the suspend behavior may be different. We want
> to retry the f/w load. Ideally, the f/w loader should abort and
> automatically retry after resume. In this case, also there is no big

I don't think it is good idea since suspend() may need firmware to
change the device's power state. Considered that only two insane
drivers use FW_ACTION_NOHOTPLUG, we can kill the two before
suspend just like the case of shutdown.

> reason to distinguish FW_ACTION_* types. Even for udev case, the
> action can be easily retried.
>
> Or, a cleaner solution would be to get rid of FW_ACTION_NOHOTPLUG
> completely. As Kay mentioned, this was a big mistake from the very

It is still not a good idea, hackers may need FW_ACTION_NOHOTPLUG
to debug its driver with private firmwares, or examples like dell's BIOS update.


Thanks,
--
Ming Lei

2013-05-12 07:20:51

by Takashi Iwai

[permalink] [raw]
Subject: Re: [PATCH 1/3] firmware: Avoid superfluous usermodehelper lock

At Sat, 11 May 2013 21:01:27 +0800,
Ming Lei wrote:
>
> On Fri, May 10, 2013 at 5:32 PM, Takashi Iwai <[email protected]> wrote:
> > At Fri, 10 May 2013 09:25:51 +0800,
> >>
> >> Anyway, if you want to force killing loader, please only kill these
> >> FW_ACTION_NOHOTPLUG before suspend and reboot, and do
> >> not touch FW_ACTION_HOTPLUG. Is it OK for you?
> >
> > Note that, as with my patch, only the shutdown case is handled. Let's
> > not mixing up suspend and shutdown behavior for now.
> >
> > I see no reason why we need to wait at shutdown even for
> > FW_ACTION_HOTPLUG. At that point, there should be no longer
> > user-space action. It means that the driver shouldn't get any more
> > data to finish the f/w loading upon that point. Thus the only
>
> For example, when one ethernet driver is requesting its firmware,
> the loader is forced to be killed before shutdown, so the driver can't set the
> WoL any more in its shutdown(), then users start to complain it is a
> regression.

First off, this can't happen because, as mentioned earlier, the point
we're discussing is already the moment after user-space is supposed to
have been finished by init, i.e. there is already no udev. Thus the
pending f/w request via usermode helper can never finish. So, it
makes no sense to wait for any user-space action at that point. It
just locks up.

Secondly, if this would ever work, it's still just a luck. The
protection is only about the usermode helper lock in the f/w loading
code. This doesn't mean that the whole pending driver work would be
finished. In other words, such a driver design is just broken.

> That is why I suggest you to only kill requests of FW_ACTION_NOHOTPLUG.
>
> Also it isn't good to affect all drivers just for fixing two insane drivers.
>
> > possible consequence is the timeout, which is equivalent with the
> > immediate abort of the operation.
> >
> > As mentioned earlier, the suspend behavior may be different. We want
> > to retry the f/w load. Ideally, the f/w loader should abort and
> > automatically retry after resume. In this case, also there is no big
>
> I don't think it is good idea since suspend() may need firmware to
> change the device's power state. Considered that only two insane
> drivers use FW_ACTION_NOHOTPLUG, we can kill the two before
> suspend just like the case of shutdown.

The same argument can be applied. The protection in f/w loading
doesn't guarantee that the pending driver action will be finished
before suspend. It protects only the udev reaction. But, it doesn't
protect the action after it.

> > reason to distinguish FW_ACTION_* types. Even for udev case, the
> > action can be easily retried.
> >
> > Or, a cleaner solution would be to get rid of FW_ACTION_NOHOTPLUG
> > completely. As Kay mentioned, this was a big mistake from the very
>
> It is still not a good idea, hackers may need FW_ACTION_NOHOTPLUG
> to debug its driver with private firmwares, or examples like dell's BIOS update.

Oh no, it's a badly designed interface. It should be never used.


Takashi

2013-05-12 13:32:44

by Ming Lei

[permalink] [raw]
Subject: Re: [PATCH 1/3] firmware: Avoid superfluous usermodehelper lock

On Sun, May 12, 2013 at 3:20 PM, Takashi Iwai <[email protected]> wrote:
> At Sat, 11 May 2013 21:01:27 +0800,
> Ming Lei wrote:
>>
>> On Fri, May 10, 2013 at 5:32 PM, Takashi Iwai <[email protected]> wrote:
>> > At Fri, 10 May 2013 09:25:51 +0800,
>> >>
>> >> Anyway, if you want to force killing loader, please only kill these
>> >> FW_ACTION_NOHOTPLUG before suspend and reboot, and do
>> >> not touch FW_ACTION_HOTPLUG. Is it OK for you?
>> >
>> > Note that, as with my patch, only the shutdown case is handled. Let's
>> > not mixing up suspend and shutdown behavior for now.
>> >
>> > I see no reason why we need to wait at shutdown even for
>> > FW_ACTION_HOTPLUG. At that point, there should be no longer
>> > user-space action. It means that the driver shouldn't get any more
>> > data to finish the f/w loading upon that point. Thus the only
>>
>> For example, when one ethernet driver is requesting its firmware,
>> the loader is forced to be killed before shutdown, so the driver can't set the
>> WoL any more in its shutdown(), then users start to complain it is a
>> regression.
>
> First off, this can't happen because, as mentioned earlier, the point
> we're discussing is already the moment after user-space is supposed to
> have been finished by init, i.e. there is already no udev. Thus the

OK, sorry for missing the fact which user-space application may have
been killed before calling sys_reboot(), even though the fact depends on
implementation of user space 'reboot'.

> pending f/w request via usermode helper can never finish. So, it
> makes no sense to wait for any user-space action at that point. It
> just locks up.
>
> Secondly, if this would ever work, it's still just a luck. The
> protection is only about the usermode helper lock in the f/w loading
> code. This doesn't mean that the whole pending driver work would be
> finished. In other words, such a driver design is just broken.

A well-implemented driver should make sure the synchronization,
looks it isn't related with firmware loading.

>
>> That is why I suggest you to only kill requests of FW_ACTION_NOHOTPLUG.
>>
>> Also it isn't good to affect all drivers just for fixing two insane drivers.
>>
>> > possible consequence is the timeout, which is equivalent with the
>> > immediate abort of the operation.
>> >
>> > As mentioned earlier, the suspend behavior may be different. We want
>> > to retry the f/w load. Ideally, the f/w loader should abort and
>> > automatically retry after resume. In this case, also there is no big
>>
>> I don't think it is good idea since suspend() may need firmware to
>> change the device's power state. Considered that only two insane
>> drivers use FW_ACTION_NOHOTPLUG, we can kill the two before
>> suspend just like the case of shutdown.
>
> The same argument can be applied. The protection in f/w loading
> doesn't guarantee that the pending driver action will be finished

The driver itself can make sure any synchronization it needed.(
probe vs. suspend, or open vs. suspend), and nothing to do with
request_firmware().

> before suspend. It protects only the udev reaction. But, it doesn't
> protect the action after it.

Also suspend is different with shutdown since usermodehelper lock
is required to be held before freezing processes, so firmware loading
should be completed before suspending.

>
>> > reason to distinguish FW_ACTION_* types. Even for udev case, the
>> > action can be easily retried.
>> >
>> > Or, a cleaner solution would be to get rid of FW_ACTION_NOHOTPLUG
>> > completely. As Kay mentioned, this was a big mistake from the very
>>
>> It is still not a good idea, hackers may need FW_ACTION_NOHOTPLUG
>> to debug its driver with private firmwares, or examples like dell's BIOS update.
>
> Oh no, it's a badly designed interface. It should be never used.

Yes, it is bad, but killing FW_ACTION_NOHOTPLUG may break dell's BIOS
update utility, also how can we meet the demand of drivers which need private
firmwares?

Thanks,
--
Ming Lei

2013-05-12 13:59:55

by Ming Lei

[permalink] [raw]
Subject: Re: [PATCH 1/3] firmware: Avoid superfluous usermodehelper lock

On Thu, May 9, 2013 at 1:51 AM, Takashi Iwai <[email protected]> wrote:
>
> ---
> From: Takashi Iwai <[email protected]>
> Subject: [PATCH v2] firmware: Avoid deadlock of usermodehelper lock at shutdown
>
> When a system goes to reboot/shutdown, it tries to disable the
> usermode helper via usermodehelper_disable(). This might be blocked
> when a driver tries to load a firmware beforehand and it's stuck by
> some reason.
>
> In this patch, the firmware class driver registers a reboot notifier
> so that it can abort all pending f/w bufs. Also enable a flag for
> avoiding the call of usermodehelper after the reboot/shutdown starts.
>
> Signed-off-by: Takashi Iwai <[email protected]>
> ---
> drivers/base/firmware_class.c | 40 +++++++++++++++++++++++++++++++++-------
> 1 file changed, 33 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c
> index 4b1f926..972e535 100644
> --- a/drivers/base/firmware_class.c
> +++ b/drivers/base/firmware_class.c
> @@ -27,6 +27,7 @@
> #include <linux/pm.h>
> #include <linux/suspend.h>
> #include <linux/syscore_ops.h>
> +#include <linux/reboot.h>
>
> #include <generated/utsrelease.h>
>
> @@ -171,6 +172,7 @@ static struct firmware_buf *__allocate_fw_buf(const char *fw_name,
> strcpy(buf->fw_id, fw_name);
> buf->fwc = fwc;
> init_completion(&buf->completion);
> + INIT_LIST_HEAD(&buf->list);

You should introduce one extra field(such as, 'list_abort') in
'struct firmware_buf' and the field of 'list' is for firmware caching now.
Also, INIT_LIST_HEAD isn't needed here.

>
> pr_debug("%s: fw-%s buf=%p\n", __func__, fw_name, buf);
>
> @@ -446,10 +448,9 @@ static struct firmware_priv *to_firmware_priv(struct device *dev)
> return container_of(dev, struct firmware_priv, dev);
> }
>
> -static void fw_load_abort(struct firmware_priv *fw_priv)
> +static void fw_load_abort(struct firmware_buf *buf)
> {
> - struct firmware_buf *buf = fw_priv->buf;
> -
> + list_del_init(&buf->list);
> set_bit(FW_STATUS_ABORT, &buf->status);
> complete_all(&buf->completion);
> }
> @@ -457,6 +458,24 @@ static void fw_load_abort(struct firmware_priv *fw_priv)
> #define is_fw_load_aborted(buf) \
> test_bit(FW_STATUS_ABORT, &(buf)->status)
>
> +static LIST_HEAD(pending_fw_head);
> +
> +/* reboot notifier for avoid deadlock with usermode_lock */
> +static int fw_shutdown_notify(struct notifier_block *unused1,
> + unsigned long unused2, void *unused3)
> +{
> + mutex_lock(&fw_lock);
> + while (!list_empty(&pending_fw_head))
> + fw_load_abort(list_first_entry(&pending_fw_head,
> + struct firmware_buf, list));
> + mutex_unlock(&fw_lock);
> + return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block fw_shutdown_nb = {
> + .notifier_call = fw_shutdown_notify,
> +};
> +
> static ssize_t firmware_timeout_show(struct class *class,
> struct class_attribute *attr,
> char *buf)
> @@ -604,6 +623,7 @@ static ssize_t firmware_loading_store(struct device *dev,
> * is completed.
> * */
> fw_map_pages_buf(fw_buf);
> + list_del_init(&fw_buf->list);
> complete_all(&fw_buf->completion);
> break;
> }
> @@ -612,7 +632,7 @@ static ssize_t firmware_loading_store(struct device *dev,
> dev_err(dev, "%s: unexpected value (%d)\n", __func__, loading);
> /* fallthrough */
> case -1:
> - fw_load_abort(fw_priv);
> + fw_load_abort(fw_buf);
> break;
> }
> out:
> @@ -680,7 +700,7 @@ static int fw_realloc_buffer(struct firmware_priv *fw_priv, int min_size)
> new_pages = kmalloc(new_array_size * sizeof(void *),
> GFP_KERNEL);
> if (!new_pages) {
> - fw_load_abort(fw_priv);
> + fw_load_abort(buf);
> return -ENOMEM;
> }
> memcpy(new_pages, buf->pages,
> @@ -697,7 +717,7 @@ static int fw_realloc_buffer(struct firmware_priv *fw_priv, int min_size)
> alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
>
> if (!buf->pages[buf->nr_pages]) {
> - fw_load_abort(fw_priv);
> + fw_load_abort(buf);
> return -ENOMEM;
> }
> buf->nr_pages++;
> @@ -781,7 +801,7 @@ static void firmware_class_timeout_work(struct work_struct *work)
> mutex_unlock(&fw_lock);
> return;
> }
> - fw_load_abort(fw_priv);
> + fw_load_abort(fw_priv->buf);
> mutex_unlock(&fw_lock);
> }
>
> @@ -857,6 +877,10 @@ static int _request_firmware_load(struct firmware_priv *fw_priv, bool uevent,
> kobject_uevent(&fw_priv->dev.kobj, KOBJ_ADD);
> }
>
> + mutex_lock(&fw_lock);
> + list_add(&buf->list, &pending_fw_head);
> + mutex_unlock(&fw_lock);
> +
> wait_for_completion(&buf->completion);
>
> cancel_delayed_work_sync(&fw_priv->timeout_work);
> @@ -1517,6 +1541,7 @@ static int __init firmware_class_init(void)
> {
> fw_cache_init();
> #ifdef CONFIG_FW_LOADER_USER_HELPER
> + register_reboot_notifier(&fw_shutdown_nb);
> return class_register(&firmware_class);
> #else
> return 0;
> @@ -1530,6 +1555,7 @@ static void __exit firmware_class_exit(void)
> unregister_pm_notifier(&fw_cache.pm_notify);
> #endif
> #ifdef CONFIG_FW_LOADER_USER_HELPER
> + unregister_reboot_notifier(&fw_shutdown_nb);
> class_unregister(&firmware_class);
> #endif
> }
> --
> 1.8.2.1

Thanks,
--
Ming Lei

2013-05-13 14:48:55

by Takashi Iwai

[permalink] [raw]
Subject: Re: [PATCH 1/3] firmware: Avoid superfluous usermodehelper lock

At Sun, 12 May 2013 21:32:39 +0800,
Ming Lei wrote:
>
> On Sun, May 12, 2013 at 3:20 PM, Takashi Iwai <[email protected]> wrote:
> > At Sat, 11 May 2013 21:01:27 +0800,
> > Ming Lei wrote:
> >>
> >> On Fri, May 10, 2013 at 5:32 PM, Takashi Iwai <[email protected]> wrote:
> >> > At Fri, 10 May 2013 09:25:51 +0800,
> >> >>
> >> >> Anyway, if you want to force killing loader, please only kill these
> >> >> FW_ACTION_NOHOTPLUG before suspend and reboot, and do
> >> >> not touch FW_ACTION_HOTPLUG. Is it OK for you?
> >> >
> >> > Note that, as with my patch, only the shutdown case is handled. Let's
> >> > not mixing up suspend and shutdown behavior for now.
> >> >
> >> > I see no reason why we need to wait at shutdown even for
> >> > FW_ACTION_HOTPLUG. At that point, there should be no longer
> >> > user-space action. It means that the driver shouldn't get any more
> >> > data to finish the f/w loading upon that point. Thus the only
> >>
> >> For example, when one ethernet driver is requesting its firmware,
> >> the loader is forced to be killed before shutdown, so the driver can't set the
> >> WoL any more in its shutdown(), then users start to complain it is a
> >> regression.
> >
> > First off, this can't happen because, as mentioned earlier, the point
> > we're discussing is already the moment after user-space is supposed to
> > have been finished by init, i.e. there is already no udev. Thus the
>
> OK, sorry for missing the fact which user-space application may have
> been killed before calling sys_reboot(), even though the fact depends on
> implementation of user space 'reboot'.

Heh, the existence of f/w loading via user-space itself already
depends on the user-space behavior.


> > pending f/w request via usermode helper can never finish. So, it
> > makes no sense to wait for any user-space action at that point. It
> > just locks up.
> >
> > Secondly, if this would ever work, it's still just a luck. The
> > protection is only about the usermode helper lock in the f/w loading
> > code. This doesn't mean that the whole pending driver work would be
> > finished. In other words, such a driver design is just broken.
>
> A well-implemented driver should make sure the synchronization,
> looks it isn't related with firmware loading.

Right, and that's my point, too. Why the f/w loader needs to care
the completion of unprotected f/w loading?


> >> That is why I suggest you to only kill requests of FW_ACTION_NOHOTPLUG.
> >>
> >> Also it isn't good to affect all drivers just for fixing two insane drivers.
> >>
> >> > possible consequence is the timeout, which is equivalent with the
> >> > immediate abort of the operation.
> >> >
> >> > As mentioned earlier, the suspend behavior may be different. We want
> >> > to retry the f/w load. Ideally, the f/w loader should abort and
> >> > automatically retry after resume. In this case, also there is no big
> >>
> >> I don't think it is good idea since suspend() may need firmware to
> >> change the device's power state. Considered that only two insane
> >> drivers use FW_ACTION_NOHOTPLUG, we can kill the two before
> >> suspend just like the case of shutdown.
> >
> > The same argument can be applied. The protection in f/w loading
> > doesn't guarantee that the pending driver action will be finished
>
> The driver itself can make sure any synchronization it needed.(
> probe vs. suspend, or open vs. suspend), and nothing to do with
> request_firmware().
>
> > before suspend. It protects only the udev reaction. But, it doesn't
> > protect the action after it.
>
> Also suspend is different with shutdown since usermodehelper lock
> is required to be held before freezing processes, so firmware loading
> should be completed before suspending.

Yes, but the question is why it must not be aborted but completed.
That's my primary question.

(Though, the abort would require some automatic restart of f/w load
after resume, as long as we keep the current interface. So, this
would lead to more codes, and thus it's a clear drawback... If the
argument were in that way, I'd agree :) Just giving the time limit to
NOHOTPLUG case would ease the problems in practice. It's not perfect,
but maybe we should go for a better perfection, the removal of such
mess (userspace involvement) from the whole f/w loader code.)


> >> > reason to distinguish FW_ACTION_* types. Even for udev case, the
> >> > action can be easily retried.
> >> >
> >> > Or, a cleaner solution would be to get rid of FW_ACTION_NOHOTPLUG
> >> > completely. As Kay mentioned, this was a big mistake from the very
> >>
> >> It is still not a good idea, hackers may need FW_ACTION_NOHOTPLUG
> >> to debug its driver with private firmwares, or examples like dell's BIOS update.
> >
> > Oh no, it's a badly designed interface. It should be never used.
>
> Yes, it is bad, but killing FW_ACTION_NOHOTPLUG may break dell's BIOS
> update utility, also how can we meet the demand of drivers which need private
> firmwares?

Feeding the firmware data can be implemented pretty easily in a
different way, e.g. just by providing a misc driver interface with
read(). But, the biggest problem is the compatibility. That's why I
wrote "if allowed" in the earlier mail :)

Seriously, _if allowed_, we should go for this direction: killing
usermode f/w loading. Then the removal of NOHOTPLUG would be the
first step.


thanks,

Takashi

2013-05-13 15:04:21

by Takashi Iwai

[permalink] [raw]
Subject: Re: [PATCH 1/3] firmware: Avoid superfluous usermodehelper lock

At Sun, 12 May 2013 21:59:51 +0800,
Ming Lei wrote:
>
> On Thu, May 9, 2013 at 1:51 AM, Takashi Iwai <[email protected]> wrote:
> >
> > ---
> > From: Takashi Iwai <[email protected]>
> > Subject: [PATCH v2] firmware: Avoid deadlock of usermodehelper lock at shutdown
> >
> > When a system goes to reboot/shutdown, it tries to disable the
> > usermode helper via usermodehelper_disable(). This might be blocked
> > when a driver tries to load a firmware beforehand and it's stuck by
> > some reason.
> >
> > In this patch, the firmware class driver registers a reboot notifier
> > so that it can abort all pending f/w bufs. Also enable a flag for
> > avoiding the call of usermodehelper after the reboot/shutdown starts.
> >
> > Signed-off-by: Takashi Iwai <[email protected]>
> > ---
> > drivers/base/firmware_class.c | 40 +++++++++++++++++++++++++++++++++-------
> > 1 file changed, 33 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c
> > index 4b1f926..972e535 100644
> > --- a/drivers/base/firmware_class.c
> > +++ b/drivers/base/firmware_class.c
> > @@ -27,6 +27,7 @@
> > #include <linux/pm.h>
> > #include <linux/suspend.h>
> > #include <linux/syscore_ops.h>
> > +#include <linux/reboot.h>
> >
> > #include <generated/utsrelease.h>
> >
> > @@ -171,6 +172,7 @@ static struct firmware_buf *__allocate_fw_buf(const char *fw_name,
> > strcpy(buf->fw_id, fw_name);
> > buf->fwc = fwc;
> > init_completion(&buf->completion);
> > + INIT_LIST_HEAD(&buf->list);
>
> You should introduce one extra field(such as, 'list_abort') in
> 'struct firmware_buf' and the field of 'list' is for firmware caching now.

Yeah, the current code messes it up. Sorry for not resending the
right version. I thought it won't conflict but it seems it doesn't
work as is. (At least, list_add() would need to be replaced with
list_move()).

> Also, INIT_LIST_HEAD isn't needed here.

Well, I'd recommend to avoid such a mini optimization unless it's
really a hot path. It may bite you in the end...


thanks,

Takashi

2013-05-21 17:02:32

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [PATCH 0/3] firmware: Fix usermodehelper deadlock at shutdown

On Wed, May 08, 2013 at 08:56:34AM +0200, Takashi Iwai wrote:
> Hi,
>
> this is a series of patches for the issue we faced in the firmware
> loader code during debugging the problem with dell_rbu driver with
> 3.9 kernel.
>
> The original problem was that the shutdown gets stuck when DELL BIOS
> update is performed. This turned out to be a problem in the firmware
> loader. Although the reason of dell_rbu driver breakage is still
> unclear, we should fix the firmware loader side, at least, not to
> stall during shutdown.

What's the status of these? Ming, still have objections? Takashi,
still object to the objections?

Should I just take them and let you two continue to argue? :)

confused,

greg k-h

2013-05-22 01:21:28

by Ming Lei

[permalink] [raw]
Subject: Re: [PATCH 0/3] firmware: Fix usermodehelper deadlock at shutdown

On Wed, May 22, 2013 at 1:02 AM, Greg Kroah-Hartman
<[email protected]> wrote:
> On Wed, May 08, 2013 at 08:56:34AM +0200, Takashi Iwai wrote:
>> Hi,
>>
>> this is a series of patches for the issue we faced in the firmware
>> loader code during debugging the problem with dell_rbu driver with
>> 3.9 kernel.
>>
>> The original problem was that the shutdown gets stuck when DELL BIOS
>> update is performed. This turned out to be a problem in the firmware
>> loader. Although the reason of dell_rbu driver breakage is still
>> unclear, we should fix the firmware loader side, at least, not to
>> stall during shutdown.
>
> What's the status of these? Ming, still have objections?

I have no objections now, but Takashi need to submit v2 to fix one
mistake which may break current firmware cache.

Thanks,
--
Ming Lei

2013-05-22 16:24:48

by Takashi Iwai

[permalink] [raw]
Subject: Re: [PATCH 0/3] firmware: Fix usermodehelper deadlock at shutdown

At Wed, 22 May 2013 09:21:24 +0800,
Ming Lei wrote:
>
> On Wed, May 22, 2013 at 1:02 AM, Greg Kroah-Hartman
> <[email protected]> wrote:
> > On Wed, May 08, 2013 at 08:56:34AM +0200, Takashi Iwai wrote:
> >> Hi,
> >>
> >> this is a series of patches for the issue we faced in the firmware
> >> loader code during debugging the problem with dell_rbu driver with
> >> 3.9 kernel.
> >>
> >> The original problem was that the shutdown gets stuck when DELL BIOS
> >> update is performed. This turned out to be a problem in the firmware
> >> loader. Although the reason of dell_rbu driver breakage is still
> >> unclear, we should fix the firmware loader side, at least, not to
> >> stall during shutdown.
> >
> > What's the status of these? Ming, still have objections?
>
> I have no objections now, but Takashi need to submit v2 to fix one
> mistake which may break current firmware cache.

Sorry for the late response. I'm going to submit the revised patches
now.


Takashi