[Problem]
efi_pstore creates sysfs entries ,which enable users to access to NVRAM,
in a write callback. If a kernel panic happens in interrupt contexts, pstore may
fail because it could sleep due to dynamic memory allocations during creating sysfs entries.
[Patch Description]
This patch removes sysfs operations from write callback by introducing a workqueue
updating sysfs entries which is kicked after a write callback of efi_pstore is called.
efi_pstore will be robust against a kernel panic in an interrupt context with it.
The workqueue works as follows.
- A timer is registered during an initialization of efivars module.
- A flag, update_sysfs_entries, is checked periodically with the timer.
- The flag is set in a write callback, efi_pstore_write().
- Operation updating sysfs entries is kicked if the flag is set.
Any operations ,like registering a timer, should not be added in a write callback because it may run
in panic case and fail due to operations related to workqueue.
To make efi_pstore work in panic case, a write callback should just log to NVRAM.
Signed-off-by: Seiji Aguchi <[email protected]>
---
drivers/firmware/efivars.c | 127 ++++++++++++++++++++++++++++++++++++++++----
include/linux/efi.h | 3 +-
2 files changed, 118 insertions(+), 12 deletions(-)
diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c
index b6a813e..9f27ef6 100644
--- a/drivers/firmware/efivars.c
+++ b/drivers/firmware/efivars.c
@@ -146,6 +146,17 @@ efivar_create_sysfs_entry(struct efivars *efivars,
efi_char16_t *variable_name,
efi_guid_t *vendor_guid);
+/*
+ * Prototype for workqueue functions updating sysfs entry
+ */
+
+static void efivar_update_sysfs_entry(struct work_struct *);
+#define EFIVAR_INTERVAL (60 * HZ)
+static void efivar_timefunc(unsigned long);
+static DEFINE_TIMER(efivar_timer, efivar_timefunc, 0, 0);
+static DECLARE_WORK(efivar_work, efivar_update_sysfs_entry);
+static int update_sysfs_entries;
+
/* Return the number of unicode characters in data */
static unsigned long
utf16_strnlen(efi_char16_t *s, size_t maxlength)
@@ -739,9 +750,6 @@ static int efi_pstore_write(enum pstore_type_id type,
0, NULL);
}
- if (found)
- list_del(&found->list);
-
for (i = 0; i < DUMP_NAME_LEN; i++)
efi_name[i] = name[i];
@@ -750,14 +758,7 @@ static int efi_pstore_write(enum pstore_type_id type,
spin_unlock_irqrestore(&efivars->lock, flags);
- if (found)
- efivar_unregister(found);
-
- if (size)
- ret = efivar_create_sysfs_entry(efivars,
- utf16_strsize(efi_name,
- DUMP_NAME_LEN * 2),
- efi_name, &vendor);
+ update_sysfs_entries = 1;
*id = part;
return ret;
@@ -1201,6 +1202,8 @@ int register_efivars(struct efivars *efivars,
pstore_register(&efivars->efi_pstore_info);
}
+ add_timer(&efivar_timer);
+
out:
kfree(variable_name);
@@ -1211,6 +1214,107 @@ EXPORT_SYMBOL_GPL(register_efivars);
static struct efivars __efivars;
static struct efivar_operations ops;
+static void delete_sysfs_entry(void)
+{
+ struct efivars *efivars = &__efivars;
+ struct efivar_entry *entry, *n;
+ efi_status_t status;
+ unsigned long flags;
+
+ list_for_each_entry_safe(entry, n, &efivars->list, list) {
+ spin_lock_irqsave(&efivars->lock, flags);
+ status = get_var_data_locked(efivars, &entry->var);
+ if (status != EFI_SUCCESS) {
+ list_del(&entry->list);
+ spin_unlock_irqrestore(&efivars->lock, flags);
+ efivar_unregister(entry);
+ } else
+ spin_unlock_irqrestore(&efivars->lock, flags);
+ }
+}
+
+static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor)
+{
+ struct efivar_entry *entry, *n;
+ struct efivars *efivars = &__efivars;
+ unsigned long strsize1, strsize2;
+ bool found = false;
+
+ strsize1 = utf16_strsize(variable_name, 1024);
+ list_for_each_entry_safe(entry, n, &efivars->list, list) {
+ strsize2 = utf16_strsize(entry->var.VariableName, 1024);
+ if (strsize1 == strsize2 &&
+ !memcmp(variable_name, &(entry->var.VariableName),
+ strsize2) &&
+ !efi_guidcmp(entry->var.VendorGuid,
+ *vendor)) {
+ found = true;
+ break;
+ }
+ }
+ return found;
+}
+
+static void efivar_update_sysfs_entry(struct work_struct *work)
+{
+ struct efivars *efivars = &__efivars;
+ efi_guid_t vendor;
+ efi_char16_t *variable_name;
+ unsigned long variable_name_size = 1024, flags;
+ efi_status_t status = EFI_NOT_FOUND;
+ bool found;
+
+ /* Delete unavailable sysfs entries */
+ delete_sysfs_entry();
+
+ /* Add new sysfs entries */
+ while (1) {
+ variable_name = kzalloc(variable_name_size, GFP_KERNEL);
+ if (!variable_name) {
+ pr_err("efivars: Memory allocation failed.\n");
+ return;
+ }
+
+ spin_lock_irqsave(&efivars->lock, flags);
+ found = false;
+ while (1) {
+ variable_name_size = 1024;
+ status = efivars->ops->get_next_variable(
+ &variable_name_size,
+ variable_name,
+ &vendor);
+ if (status != EFI_SUCCESS) {
+ break;
+ } else {
+ if (!variable_is_present(variable_name,
+ &vendor)) {
+ found = true;
+ break;
+ }
+ }
+ }
+ spin_unlock_irqrestore(&efivars->lock, flags);
+
+ if (!found) {
+ kfree(variable_name);
+ break;
+ } else
+ efivar_create_sysfs_entry(efivars,
+ variable_name_size,
+ variable_name, &vendor);
+ }
+}
+
+static void efivar_timefunc(unsigned long dummy)
+{
+ if (update_sysfs_entries) {
+ update_sysfs_entries = 0;
+ schedule_work(&efivar_work);
+ }
+
+ mod_timer(&efivar_timer, jiffies + EFIVAR_INTERVAL);
+}
+
/*
* For now we register the efi subsystem with the firmware subsystem
* and the vars subsystem with the efi subsystem. In the future, it
@@ -1266,6 +1370,7 @@ static void __exit
efivars_exit(void)
{
if (efi_enabled) {
+ del_timer_sync(&efivar_timer);
unregister_efivars(&__efivars);
kobject_put(efi_kobj);
}
diff --git a/include/linux/efi.h b/include/linux/efi.h
index f1f5703..3e0c37a 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -641,7 +641,8 @@ struct efivars {
* 1) ->list - adds, removals, reads, writes
* 2) ops.[gs]et_variable() calls.
* It must not be held when creating sysfs entries or calling kmalloc.
- * ops.get_next_variable() is only called from register_efivars(),
+ * ops.get_next_variable() is only called from register_efivars()
+ * or efivar_update_sysfs_entry(),
* which is protected by the BKL, so that path is safe.
*/
spinlock_t lock;
-- 1.7.1
On Fri, Aug 17, 2012 at 12:43 PM, Seiji Aguchi <[email protected]> wrote:
> [Problem]
> efi_pstore creates sysfs entries ,which enable users to access to NVRAM,
> in a write callback. If a kernel panic happens in interrupt contexts, pstore may
> fail because it could sleep due to dynamic memory allocations during creating sysfs entries.
>
> [Patch Description]
> This patch removes sysfs operations from write callback by introducing a workqueue
> updating sysfs entries which is kicked after a write callback of efi_pstore is called.
> efi_pstore will be robust against a kernel panic in an interrupt context with it.
> The workqueue works as follows.
>
> - A timer is registered during an initialization of efivars module.
> - A flag, update_sysfs_entries, is checked periodically with the timer.
> - The flag is set in a write callback, efi_pstore_write().
> - Operation updating sysfs entries is kicked if the flag is set.
>
> Any operations ,like registering a timer, should not be added in a write callback because it may run
> in panic case and fail due to operations related to workqueue.
I'm not a fan of creating a periodic timer that wakes up here to check
for an event that should be considered very rare.
Can this just become scheduled work? Scheduling work itself is a very
lightweight process and should be relatively safe to do from a pstore
write.
>
> To make efi_pstore work in panic case, a write callback should just log to NVRAM.
>
> Signed-off-by: Seiji Aguchi <[email protected]>
> ---
> drivers/firmware/efivars.c | 127 ++++++++++++++++++++++++++++++++++++++++----
> include/linux/efi.h | 3 +-
> 2 files changed, 118 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c
> index b6a813e..9f27ef6 100644
> --- a/drivers/firmware/efivars.c
> +++ b/drivers/firmware/efivars.c
> @@ -146,6 +146,17 @@ efivar_create_sysfs_entry(struct efivars *efivars,
> efi_char16_t *variable_name,
> efi_guid_t *vendor_guid);
>
> +/*
> + * Prototype for workqueue functions updating sysfs entry
> + */
> +
> +static void efivar_update_sysfs_entry(struct work_struct *);
> +#define EFIVAR_INTERVAL (60 * HZ)
> +static void efivar_timefunc(unsigned long);
> +static DEFINE_TIMER(efivar_timer, efivar_timefunc, 0, 0);
> +static DECLARE_WORK(efivar_work, efivar_update_sysfs_entry);
> +static int update_sysfs_entries;
> +
> /* Return the number of unicode characters in data */
> static unsigned long
> utf16_strnlen(efi_char16_t *s, size_t maxlength)
> @@ -739,9 +750,6 @@ static int efi_pstore_write(enum pstore_type_id type,
> 0, NULL);
> }
>
> - if (found)
> - list_del(&found->list);
> -
> for (i = 0; i < DUMP_NAME_LEN; i++)
> efi_name[i] = name[i];
>
> @@ -750,14 +758,7 @@ static int efi_pstore_write(enum pstore_type_id type,
>
> spin_unlock_irqrestore(&efivars->lock, flags);
>
> - if (found)
> - efivar_unregister(found);
> -
> - if (size)
> - ret = efivar_create_sysfs_entry(efivars,
> - utf16_strsize(efi_name,
> - DUMP_NAME_LEN * 2),
> - efi_name, &vendor);
> + update_sysfs_entries = 1;
>
> *id = part;
> return ret;
> @@ -1201,6 +1202,8 @@ int register_efivars(struct efivars *efivars,
> pstore_register(&efivars->efi_pstore_info);
> }
>
> + add_timer(&efivar_timer);
> +
> out:
> kfree(variable_name);
>
> @@ -1211,6 +1214,107 @@ EXPORT_SYMBOL_GPL(register_efivars);
> static struct efivars __efivars;
> static struct efivar_operations ops;
>
> +static void delete_sysfs_entry(void)
> +{
> + struct efivars *efivars = &__efivars;
> + struct efivar_entry *entry, *n;
> + efi_status_t status;
> + unsigned long flags;
> +
> + list_for_each_entry_safe(entry, n, &efivars->list, list) {
> + spin_lock_irqsave(&efivars->lock, flags);
> + status = get_var_data_locked(efivars, &entry->var);
> + if (status != EFI_SUCCESS) {
> + list_del(&entry->list);
> + spin_unlock_irqrestore(&efivars->lock, flags);
> + efivar_unregister(entry);
> + } else
> + spin_unlock_irqrestore(&efivars->lock, flags);
> + }
> +}
> +
> +static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor)
> +{
> + struct efivar_entry *entry, *n;
> + struct efivars *efivars = &__efivars;
> + unsigned long strsize1, strsize2;
> + bool found = false;
> +
> + strsize1 = utf16_strsize(variable_name, 1024);
> + list_for_each_entry_safe(entry, n, &efivars->list, list) {
> + strsize2 = utf16_strsize(entry->var.VariableName, 1024);
> + if (strsize1 == strsize2 &&
> + !memcmp(variable_name, &(entry->var.VariableName),
> + strsize2) &&
> + !efi_guidcmp(entry->var.VendorGuid,
> + *vendor)) {
> + found = true;
> + break;
> + }
> + }
> + return found;
> +}
> +
> +static void efivar_update_sysfs_entry(struct work_struct *work)
> +{
> + struct efivars *efivars = &__efivars;
> + efi_guid_t vendor;
> + efi_char16_t *variable_name;
> + unsigned long variable_name_size = 1024, flags;
> + efi_status_t status = EFI_NOT_FOUND;
> + bool found;
> +
> + /* Delete unavailable sysfs entries */
> + delete_sysfs_entry();
> +
> + /* Add new sysfs entries */
> + while (1) {
> + variable_name = kzalloc(variable_name_size, GFP_KERNEL);
> + if (!variable_name) {
> + pr_err("efivars: Memory allocation failed.\n");
> + return;
> + }
> +
> + spin_lock_irqsave(&efivars->lock, flags);
> + found = false;
> + while (1) {
> + variable_name_size = 1024;
> + status = efivars->ops->get_next_variable(
> + &variable_name_size,
> + variable_name,
> + &vendor);
> + if (status != EFI_SUCCESS) {
> + break;
> + } else {
> + if (!variable_is_present(variable_name,
> + &vendor)) {
> + found = true;
> + break;
> + }
> + }
> + }
> + spin_unlock_irqrestore(&efivars->lock, flags);
> +
> + if (!found) {
> + kfree(variable_name);
> + break;
> + } else
> + efivar_create_sysfs_entry(efivars,
> + variable_name_size,
> + variable_name, &vendor);
> + }
> +}
> +
> +static void efivar_timefunc(unsigned long dummy)
> +{
> + if (update_sysfs_entries) {
> + update_sysfs_entries = 0;
> + schedule_work(&efivar_work);
> + }
> +
> + mod_timer(&efivar_timer, jiffies + EFIVAR_INTERVAL);
> +}
> +
> /*
> * For now we register the efi subsystem with the firmware subsystem
> * and the vars subsystem with the efi subsystem. In the future, it
> @@ -1266,6 +1370,7 @@ static void __exit
> efivars_exit(void)
> {
> if (efi_enabled) {
> + del_timer_sync(&efivar_timer);
> unregister_efivars(&__efivars);
> kobject_put(efi_kobj);
> }
> diff --git a/include/linux/efi.h b/include/linux/efi.h
> index f1f5703..3e0c37a 100644
> --- a/include/linux/efi.h
> +++ b/include/linux/efi.h
> @@ -641,7 +641,8 @@ struct efivars {
> * 1) ->list - adds, removals, reads, writes
> * 2) ops.[gs]et_variable() calls.
> * It must not be held when creating sysfs entries or calling kmalloc.
> - * ops.get_next_variable() is only called from register_efivars(),
> + * ops.get_next_variable() is only called from register_efivars()
> + * or efivar_update_sysfs_entry(),
> * which is protected by the BKL, so that path is safe.
> */
> spinlock_t lock;
> -- 1.7.1
> I'm not a fan of creating a periodic timer that wakes up here to check for an event that should be considered very rare.
>
> Can this just become scheduled work? Scheduling work itself is a very lightweight process and should be relatively safe to do from a
> pstore write.
I agree that the periodic timer is heavy a bit.
But I would like to keep a write callback simple as much as possible in panic situation.
For example, I'm concerned that efi_pstore hangs up due to some spin_locks related workqueue like gcwq->lock.
Also, a situation which this workqueue is needed is just oops case because, system will be down and users can't access to sysfs files
in other cases, panic, reboot and emergency_restart.
So, Can I call schedule_work in oops case only as follows?
efi_pstore_write()
{
<write log to NVRAM>
if (reason != KMSG_DUMP_OOPS)
return;
schedule_work()
}
On Fri, Aug 17, 2012 at 2:15 PM, Seiji Aguchi <[email protected]> wrote:
>
>> I'm not a fan of creating a periodic timer that wakes up here to check for an event that should be considered very rare.
>>
>> Can this just become scheduled work? Scheduling work itself is a very lightweight process and should be relatively safe to do from a
>> pstore write.
>
> I agree that the periodic timer is heavy a bit.
> But I would like to keep a write callback simple as much as possible in panic situation.
> For example, I'm concerned that efi_pstore hangs up due to some spin_locks related workqueue like gcwq->lock.
>
> Also, a situation which this workqueue is needed is just oops case because, system will be down and users can't access to sysfs files
> in other cases, panic, reboot and emergency_restart.
>
> So, Can I call schedule_work in oops case only as follows?
>
> efi_pstore_write()
> {
> <write log to NVRAM>
>
> if (reason != KMSG_DUMP_OOPS)
> return;
>
> schedule_work()
You could, but why not always just schedule_work()? If we are hosed
by broken workqueue/scheduler locking, the user isn't going to see
those files in sysfs either way :) If you are going to insist that we
shouldn't schedule_work() in the other cases, I'd prefer:
/* The user may want to see an entry for this write in sysfs. */
if (reason == KMSG_DUMP_OOPS)
schedule_work(...);
>
> }
> You could, but why not always just schedule_work()? If we are hosed by broken workqueue/scheduler locking, the user isn't going to
> see those files in sysfs either way :)
I'm not concern about failure of sysfs operations.
In panic function call, panic_notifier_chain is kicked. Also users may expect the system reboots in panic case
by specifying /proc/sys/kernel/panic.
I just would like to avoid failures of those operations without adding unnecessary calls in the write callback.
> If you are going to insist that we shouldn't schedule_work() in the other cases, I'd prefer:
>
> /* The user may want to see an entry for this write in sysfs. */ if (reason == KMSG_DUMP_OOPS)
> schedule_work(...);
>
Thank you for your suggestion. I will update my patch by adding this logic above.