Subject: [PATCH v2 0/5] module: Remove stop_machine from module unloading

Hi,

Here is the second version of removing stop_machine() from
module unloading patchset.

Currently, each module unload calls stop_machine() twice: once for
safely removing the module from the lists, and once for checking the
reference counter. However, neither is actually necessary for those
purposes; they are only required by the current implementation.

First, we can use RCU for the list operation; we just need a
synchronize_rcu() right before cleaning up.
Second, the reference counter can be checked atomically by using an
atomic_t instead of the per-cpu module_ref.
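
For readers who want the shape of the change up front, here is a
minimal, generic sketch (illustrative only, not taken from the patches
below; the item/item_get/item_remove names are made up) combining both
ideas: RCU list removal followed by a grace period instead of
stop_machine(), and a counter that can only be taken while non-zero.

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/types.h>

struct item {
        struct list_head list;
        atomic_t refcnt;
};

static LIST_HEAD(items);        /* readers walk this under rcu_read_lock() */

static bool item_get(struct item *it)
{
        /* Fails once refcnt has dropped to 0, i.e. removal has started. */
        return atomic_inc_not_zero(&it->refcnt);
}

static void item_remove(struct item *it)
{
        list_del_rcu(&it->list); /* invisible to new readers; ->next stays
                                  * valid for walkers already in the list */
        synchronize_rcu();       /* wait for those walkers to finish */
        kfree(it);               /* now the memory can safely go away */
}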

In v2, the series is updated as follows:

- Fix to initialize the return value in 1/5.
- Remove the module LOCKUP code (v1 4/5).
- Split v1 5/5 into two patches: replacing module_ref with atomic_t
and removing stop_machine().
- Also, add BUG_ON() and WARN_ON() for the case where the reference
counter becomes negative.

Thank you,


---

Masami Hiramatsu (5):
module: Wait for RCU synchronizing before releasing a module
module: Unlink module with RCU synchronizing instead of stop_machine
lib/bug: Use RCU list ops for module_bug_list
module: Replace module_ref with atomic_t refcnt
module: Remove stop_machine from module unloading


include/linux/module.h | 16 ------
include/trace/events/module.h | 2 -
kernel/module.c | 117 ++++++++++++++++-------------------------
lib/bug.c | 20 +++++--
4 files changed, 63 insertions(+), 92 deletions(-)

--
Masami HIRAMATSU
Software Platform Research Dpt. Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: [email protected]


Subject: [PATCH v2 1/5] module: Wait for RCU synchronizing before releasing a module

Wait for an RCU grace period on the failure path of module loading
before releasing struct module, because mod->list can still be
accessed by list walkers (e.g. kallsyms).

Signed-off-by: Masami Hiramatsu <[email protected]>
---
kernel/module.c | 2 ++
1 file changed, 2 insertions(+)

diff --git a/kernel/module.c b/kernel/module.c
index 88cec1d..331b03f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3326,6 +3326,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod->list);
wake_up_all(&module_wq);
+ /* Wait for RCU synchronizing before releasing mod->list. */
+ synchronize_rcu();
mutex_unlock(&module_mutex);
free_module:
module_deallocate(mod, info);
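
For context, here is a simplified model of the kind of lockless list
walker (such as kallsyms) that the added grace period protects. This
is an illustrative sketch only, not the actual kallsyms code, and it
assumes it sits in kernel/module.c where the static `modules` list
head is visible. The in-tree walkers rely on preempt_disable() for the
same effect; rcu_read_lock() is used here to make the pairing with
synchronize_rcu() explicit.

static bool addr_in_any_module(unsigned long addr)
{
        struct module *mod;
        bool found = false;

        rcu_read_lock();
        list_for_each_entry_rcu(mod, &modules, list) {
                if (mod->state == MODULE_STATE_UNFORMED)
                        continue;
                if (within_module_core(addr, mod) ||
                    within_module_init(addr, mod)) {
                        found = true;
                        break;
                }
        }
        rcu_read_unlock();

        return found;
}

Without the synchronize_rcu() added above, the failure path of
load_module() could free the struct module while such a walker is
still dereferencing it.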

Subject: [PATCH v2 2/5] module: Unlink module with RCU synchronizing instead of stop_machine

Unlink the module from the module list with RCU synchronization
instead of stop_machine(). Since the module list is already protected
by RCU, we don't need stop_machine() anymore.

Signed-off-by: Masami Hiramatsu <[email protected]>
---
kernel/module.c | 18 +++++-------------
1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/kernel/module.c b/kernel/module.c
index 331b03f..bed608b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1697,18 +1697,6 @@ static void mod_sysfs_teardown(struct module *mod)
mod_sysfs_fini(mod);
}

-/*
- * unlink the module with the whole machine is stopped with interrupts off
- * - this defends against kallsyms not taking locks
- */
-static int __unlink_module(void *_mod)
-{
- struct module *mod = _mod;
- list_del(&mod->list);
- module_bug_cleanup(mod);
- return 0;
-}
-
#ifdef CONFIG_DEBUG_SET_MODULE_RONX
/*
* LKM RO/NX protection: protect module's text/ro-data
@@ -1860,7 +1848,11 @@ static void free_module(struct module *mod)

/* Now we can delete it from the lists */
mutex_lock(&module_mutex);
- stop_machine(__unlink_module, mod, NULL);
+ /* Unlink carefully: kallsyms could be walking list. */
+ list_del_rcu(&mod->list);
+ /* Wait for RCU synchronizing before releasing mod->list. */
+ synchronize_rcu();
+ module_bug_cleanup(mod);
mutex_unlock(&module_mutex);

/* This may be NULL, but that's OK */

Subject: [PATCH v2 3/5] lib/bug: Use RCU list ops for module_bug_list

Actually, since module_bug_list should only be used from BUG() context,
we may not need this. But for anyone who wants to use it from normal
context, this makes module_bug_list an RCU list.

Signed-off-by: Masami Hiramatsu <[email protected]>
---
kernel/module.c | 5 +++--
lib/bug.c | 20 ++++++++++++++------
2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/kernel/module.c b/kernel/module.c
index bed608b..d596a30 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1850,9 +1850,10 @@ static void free_module(struct module *mod)
mutex_lock(&module_mutex);
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod->list);
- /* Wait for RCU synchronizing before releasing mod->list. */
- synchronize_rcu();
+ /* Remove this module from bug list, this uses list_del_rcu */
module_bug_cleanup(mod);
+ /* Wait for RCU synchronizing before releasing mod->list and buglist. */
+ synchronize_rcu();
mutex_unlock(&module_mutex);

/* This may be NULL, but that's OK */
diff --git a/lib/bug.c b/lib/bug.c
index d1d7c78..0c3bd95 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -64,16 +64,22 @@ static LIST_HEAD(module_bug_list);
static const struct bug_entry *module_find_bug(unsigned long bugaddr)
{
struct module *mod;
+ const struct bug_entry *bug = NULL;

- list_for_each_entry(mod, &module_bug_list, bug_list) {
- const struct bug_entry *bug = mod->bug_table;
+ rcu_read_lock();
+ list_for_each_entry_rcu(mod, &module_bug_list, bug_list) {
unsigned i;

+ bug = mod->bug_table;
for (i = 0; i < mod->num_bugs; ++i, ++bug)
if (bugaddr == bug_addr(bug))
- return bug;
+ goto out;
}
- return NULL;
+ bug = NULL;
+out:
+ rcu_read_unlock();
+
+ return bug;
}

void module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
@@ -99,13 +105,15 @@ void module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
* Strictly speaking this should have a spinlock to protect against
* traversals, but since we only traverse on BUG()s, a spinlock
* could potentially lead to deadlock and thus be counter-productive.
+ * Thus, this uses RCU to safely manipulate the bug list, since BUG
+ * must run in non-interruptive state.
*/
- list_add(&mod->bug_list, &module_bug_list);
+ list_add_rcu(&mod->bug_list, &module_bug_list);
}

void module_bug_cleanup(struct module *mod)
{
- list_del(&mod->bug_list);
+ list_del_rcu(&mod->bug_list);
}

#else
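
As a side note, the add side matters as much as the lookup side: a
lockless module_find_bug() walker must never observe a list entry
whose bug table is still uninitialized, and list_add_rcu() provides
exactly the publication ordering needed for that. A minimal sketch of
the idea (illustrative only; struct bug_holder and publish_bugs() are
invented names, not kernel code):

#include <linux/bug.h>
#include <linux/list.h>
#include <linux/rculist.h>

struct bug_holder {                     /* stand-in for struct module */
        struct list_head bug_list;
        const struct bug_entry *bug_table;
        unsigned int num_bugs;
};

static LIST_HEAD(bug_holders);

static void publish_bugs(struct bug_holder *h,
                         const struct bug_entry *table, unsigned int n)
{
        h->bug_table = table;           /* initialize everything first... */
        h->num_bugs = n;
        /*
         * ...then publish. list_add_rcu() orders the writes above before
         * the entry becomes reachable, so a concurrent lockless walker
         * that finds the entry also sees a valid bug_table/num_bugs.
         */
        list_add_rcu(&h->bug_list, &bug_holders);
}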

Subject: [PATCH v2 4/5] module: Replace module_ref with atomic_t refcnt

Replace the complex per-cpu module_ref reference counter with a
simple atomic_t refcnt. This is for code simplification.

Signed-off-by: Masami Hiramatsu <[email protected]>
---
include/linux/module.h | 16 +---------------
include/trace/events/module.h | 2 +-
kernel/module.c | 39 +++++----------------------------------
3 files changed, 7 insertions(+), 50 deletions(-)

diff --git a/include/linux/module.h b/include/linux/module.h
index 71f282a..ebfb0e1 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -210,20 +210,6 @@ enum module_state {
MODULE_STATE_UNFORMED, /* Still setting it up. */
};

-/**
- * struct module_ref - per cpu module reference counts
- * @incs: number of module get on this cpu
- * @decs: number of module put on this cpu
- *
- * We force an alignment on 8 or 16 bytes, so that alloc_percpu()
- * put @incs/@decs in same cache line, with no extra memory cost,
- * since alloc_percpu() is fine grained.
- */
-struct module_ref {
- unsigned long incs;
- unsigned long decs;
-} __attribute((aligned(2 * sizeof(unsigned long))));
-
struct module {
enum module_state state;

@@ -367,7 +353,7 @@ struct module {
/* Destruction function. */
void (*exit)(void);

- struct module_ref __percpu *refptr;
+ atomic_t refcnt;
#endif

#ifdef CONFIG_CONSTRUCTORS
diff --git a/include/trace/events/module.h b/include/trace/events/module.h
index 7c5cbfe..81c4c18 100644
--- a/include/trace/events/module.h
+++ b/include/trace/events/module.h
@@ -80,7 +80,7 @@ DECLARE_EVENT_CLASS(module_refcnt,

TP_fast_assign(
__entry->ip = ip;
- __entry->refcnt = __this_cpu_read(mod->refptr->incs) - __this_cpu_read(mod->refptr->decs);
+ __entry->refcnt = atomic_read(&mod->refcnt);
__assign_str(name, mod->name);
),

diff --git a/kernel/module.c b/kernel/module.c
index d596a30..b1d485d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -631,15 +631,11 @@ EXPORT_TRACEPOINT_SYMBOL(module_get);
/* Init the unload section of the module. */
static int module_unload_init(struct module *mod)
{
- mod->refptr = alloc_percpu(struct module_ref);
- if (!mod->refptr)
- return -ENOMEM;
-
INIT_LIST_HEAD(&mod->source_list);
INIT_LIST_HEAD(&mod->target_list);

/* Hold reference count during initialization. */
- raw_cpu_write(mod->refptr->incs, 1);
+ atomic_set(&mod->refcnt, 1);

return 0;
}
@@ -721,8 +717,6 @@ static void module_unload_free(struct module *mod)
kfree(use);
}
mutex_unlock(&module_mutex);
-
- free_percpu(mod->refptr);
}

#ifdef CONFIG_MODULE_FORCE_UNLOAD
@@ -772,28 +766,7 @@ static int try_stop_module(struct module *mod, int flags, int *forced)

unsigned long module_refcount(struct module *mod)
{
- unsigned long incs = 0, decs = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- decs += per_cpu_ptr(mod->refptr, cpu)->decs;
- /*
- * ensure the incs are added up after the decs.
- * module_put ensures incs are visible before decs with smp_wmb.
- *
- * This 2-count scheme avoids the situation where the refcount
- * for CPU0 is read, then CPU0 increments the module refcount,
- * then CPU1 drops that refcount, then the refcount for CPU1 is
- * read. We would record a decrement but not its corresponding
- * increment so we would see a low count (disaster).
- *
- * Rare situation? But module_refcount can be preempted, and we
- * might be tallying up 4096+ CPUs. So it is not impossible.
- */
- smp_rmb();
- for_each_possible_cpu(cpu)
- incs += per_cpu_ptr(mod->refptr, cpu)->incs;
- return incs - decs;
+ return (unsigned long)atomic_read(&mod->refcnt);
}
EXPORT_SYMBOL(module_refcount);

@@ -935,7 +908,7 @@ void __module_get(struct module *module)
{
if (module) {
preempt_disable();
- __this_cpu_inc(module->refptr->incs);
+ atomic_inc(&module->refcnt);
trace_module_get(module, _RET_IP_);
preempt_enable();
}
@@ -950,7 +923,7 @@ bool try_module_get(struct module *module)
preempt_disable();

if (likely(module_is_live(module))) {
- __this_cpu_inc(module->refptr->incs);
+ atomic_inc(&module->refcnt);
trace_module_get(module, _RET_IP_);
} else
ret = false;
@@ -965,9 +938,7 @@ void module_put(struct module *module)
{
if (module) {
preempt_disable();
- smp_wmb(); /* see comment in module_refcount */
- __this_cpu_inc(module->refptr->decs);
-
+ atomic_dec(&module->refcnt);
trace_module_put(module, _RET_IP_);
preempt_enable();
}

Subject: [PATCH v2 5/5] module: Remove stop_machine from module unloading

Remove stop_machine() from module unloading by adding a new reference
counting algorithm.

This atomic refcounter works like a semaphore: it can be taken
(incremented) only while the counter is not 0. When loading a module,
the module loader sets the counter to MODULE_REF_BASE (= 1), and when
unloading the module it subtracts MODULE_REF_BASE from the counter.
If no one refers to the module, the refcounter becomes 0 and we can
remove the module safely. If someone does refer to it, we try to
recover the counter by adding MODULE_REF_BASE back, unless the counter
has already become 0, because a referrer may have put the module right
before the recovery. If the recovery fails, the refcount is 0 and can
never be incremented again, so the module can safely be removed in
that case too.

Note that __module_get() still forcibly takes a module reference;
users should use try_module_get() instead.
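
For anyone who wants to check the counting scheme outside the kernel,
the following is a small user-space model in plain C11 (an illustrative
sketch, not kernel code: ref_init()/ref_get()/ref_put()/
ref_release_base() are invented names, and atomic_inc_not_zero()/
atomic_add_unless() are approximated with compare-and-swap loops).
It compiles with e.g. "cc -std=c11 refmodel.c" and walks through the
success and failure cases described above.

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define MODULE_REF_BASE 1

static atomic_int refcnt;

static void ref_init(void)
{
        atomic_store(&refcnt, MODULE_REF_BASE);
}

/* Models try_module_get(): succeeds only while the counter is not 0. */
static bool ref_get(void)
{
        int v = atomic_load(&refcnt);

        while (v != 0)
                if (atomic_compare_exchange_weak(&refcnt, &v, v + 1))
                        return true;
        return false;
}

/* Models module_put(); the kernel uses atomic_dec_if_positive() + WARN_ON. */
static void ref_put(void)
{
        atomic_fetch_sub(&refcnt, 1);
}

/* Models try_release_module_ref(): 0 means the module may be freed. */
static int ref_release_base(void)
{
        int ret = atomic_fetch_sub(&refcnt, MODULE_REF_BASE) - MODULE_REF_BASE;

        assert(ret >= 0);       /* mirrors the BUG_ON() in the patch */
        if (ret) {
                /* Users remain: put the base back, unless we raced down to 0. */
                int v = atomic_load(&refcnt);

                while (v != 0)
                        if (atomic_compare_exchange_weak(&refcnt, &v,
                                                         v + MODULE_REF_BASE))
                                return 1;       /* restored: unload refused */
                ret = 0;        /* counter hit 0: no new gets can succeed */
        }
        return ret;
}

int main(void)
{
        ref_init();
        printf("get while live:         %d\n", ref_get());           /* 1 */
        printf("release with user held: %d\n", ref_release_base());  /* 1 */
        ref_put();
        printf("release when idle:      %d\n", ref_release_base());  /* 0 */
        printf("get after release:      %d\n", ref_get());           /* 0 */
        return 0;
}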

Signed-off-by: Masami Hiramatsu <[email protected]>
---
kernel/module.c | 67 ++++++++++++++++++++++++++++++++-----------------------
1 file changed, 39 insertions(+), 28 deletions(-)

diff --git a/kernel/module.c b/kernel/module.c
index b1d485d..e772595 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -42,7 +42,6 @@
#include <linux/vermagic.h>
#include <linux/notifier.h>
#include <linux/sched.h>
-#include <linux/stop_machine.h>
#include <linux/device.h>
#include <linux/string.h>
#include <linux/mutex.h>
@@ -98,7 +97,7 @@
* 1) List of modules (also safely readable with preempt_disable),
* 2) module_use links,
* 3) module_addr_min/module_addr_max.
- * (delete uses stop_machine/add uses RCU list operations). */
+ * (delete and add uses RCU list operations). */
DEFINE_MUTEX(module_mutex);
EXPORT_SYMBOL_GPL(module_mutex);
static LIST_HEAD(modules);
@@ -628,14 +627,23 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];

EXPORT_TRACEPOINT_SYMBOL(module_get);

+/* MODULE_REF_BASE is the base reference count by kmodule loader. */
+#define MODULE_REF_BASE 1
+
/* Init the unload section of the module. */
static int module_unload_init(struct module *mod)
{
+ /*
+ * Initialize reference counter to MODULE_REF_BASE.
+ * refcnt == 0 means module is going.
+ */
+ atomic_set(&mod->refcnt, MODULE_REF_BASE);
+
INIT_LIST_HEAD(&mod->source_list);
INIT_LIST_HEAD(&mod->target_list);

/* Hold reference count during initialization. */
- atomic_set(&mod->refcnt, 1);
+ atomic_inc(&mod->refcnt);

return 0;
}
@@ -734,39 +742,39 @@ static inline int try_force_unload(unsigned int flags)
}
#endif /* CONFIG_MODULE_FORCE_UNLOAD */

-struct stopref
+/* Try to release refcount of module, 0 means success. */
+static int try_release_module_ref(struct module *mod)
{
- struct module *mod;
- int flags;
- int *forced;
-};
+ int ret;

-/* Whole machine is stopped with interrupts off when this runs. */
-static int __try_stop_module(void *_sref)
-{
- struct stopref *sref = _sref;
+ /* Try to decrement refcnt which we set at loading */
+ ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt);
+ BUG_ON(ret < 0);
+ if (ret)
+ /* Someone can put this right now, recover with checking */
+ ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0);
+
+ return ret;
+}

+static int try_stop_module(struct module *mod, int flags, int *forced)
+{
/* If it's not unused, quit unless we're forcing. */
- if (module_refcount(sref->mod) != 0) {
- if (!(*sref->forced = try_force_unload(sref->flags)))
+ if (try_release_module_ref(mod) != 0) {
+ *forced = try_force_unload(flags);
+ if (!(*forced))
return -EWOULDBLOCK;
}

/* Mark it as dying. */
- sref->mod->state = MODULE_STATE_GOING;
- return 0;
-}
-
-static int try_stop_module(struct module *mod, int flags, int *forced)
-{
- struct stopref sref = { mod, flags, forced };
+ mod->state = MODULE_STATE_GOING;

- return stop_machine(__try_stop_module, &sref, NULL);
+ return 0;
}

unsigned long module_refcount(struct module *mod)
{
- return (unsigned long)atomic_read(&mod->refcnt);
+ return (unsigned long)atomic_read(&mod->refcnt) - MODULE_REF_BASE;
}
EXPORT_SYMBOL(module_refcount);

@@ -921,11 +929,11 @@ bool try_module_get(struct module *module)

if (module) {
preempt_disable();
-
- if (likely(module_is_live(module))) {
- atomic_inc(&module->refcnt);
+ /* Note: here, we can fail to get a reference */
+ if (likely(module_is_live(module) &&
+ atomic_inc_not_zero(&module->refcnt) != 0))
trace_module_get(module, _RET_IP_);
- } else
+ else
ret = false;

preempt_enable();
@@ -936,9 +944,12 @@ EXPORT_SYMBOL(try_module_get);

void module_put(struct module *module)
{
+ int ret;
+
if (module) {
preempt_disable();
- atomic_dec(&module->refcnt);
+ ret = atomic_dec_if_positive(&module->refcnt);
+ WARN_ON(ret < 0); /* Failed to put refcount */
trace_module_put(module, _RET_IP_);
preempt_enable();
}
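
One practical consequence worth spelling out: since the unloader can
now drive the counter to 0 slightly before the module state changes to
MODULE_STATE_GOING, callers really must check the return value of
try_module_get() rather than relying on __module_get(). A sketch of
the usual caller-side pattern (struct my_object and use_object() are
hypothetical, shown only for illustration):

#include <linux/errno.h>
#include <linux/module.h>

struct my_object {                      /* hypothetical example structure */
        struct module *owner;
        void (*do_something)(struct my_object *obj);
};

static int use_object(struct my_object *obj)
{
        if (!try_module_get(obj->owner))
                return -ENODEV;         /* owner has started unloading */

        obj->do_something(obj);         /* owner cannot be freed under us */

        module_put(obj->owner);
        return 0;
}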

From: Rusty Russell
Date: 2014-10-28 05:56:07

Subject: Re: [PATCH v2 5/5] module: Remove stop_machine from module unloading

Masami Hiramatsu <[email protected]> writes:
> Remove stop_machine from module unloading by adding new reference
> counting algorithm.

Thank you!

I have applied these to my tree, and they will enter Linus' next merge
window.

Thanks!
Rusty.
PS. I created stop_machine for module unloading and CPU hotplug. Maybe
we'll get to remove it altogether soon :)