2015-04-13 14:16:56

by Peter Zijlstra

Subject: [PATCH v5 10/10] module: Rework module_addr_{min,max}

__module_address() does an initial bound check before doing the
{list/tree} iteration to find the actual module. The bound variables
are nowhere near the mod_tree cacheline; in fact, they're nowhere near
one another.

module_addr_min lives in .data while module_addr_max lives in .bss
(smarty pants GCC thinks the explicit 0 assignment is a mistake).
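
[ A minimal userspace sketch, not kernel code, of the placement rule at
work here; the names mirror the kernel variables purely for
illustration:

	/* bss_or_data.c -- compile with `gcc -c bss_or_data.c`, then
	 * inspect with `nm bss_or_data.o`: GCC puts the explicitly
	 * zero-initialized global in .bss anyway, the non-zero one
	 * in .data. */
	unsigned long module_addr_min = -1UL;	/* nm type 'D' -> .data */
	unsigned long module_addr_max = 0;	/* nm type 'B' -> .bss  */

  nm should list module_addr_min as 'D' and module_addr_max as 'B',
  i.e. different sections, hence different cachelines. ]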

Rectify this by moving the two variables into a structure together
with the latch_tree_root to guarantee they all share the same
cacheline and avoid hitting two extra cachelines for the lookup.

While reworking the bounds code, move the bound update from allocation
to insertion time; this avoids updating the bounds for a few error
paths.

Cc: Rusty Russell <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/module.c | 84 ++++++++++++++++++++++++++++++++++++--------------------
1 file changed, 54 insertions(+), 30 deletions(-)

--- a/kernel/module.c
+++ b/kernel/module.c
@@ -171,7 +171,26 @@ static const struct latch_tree_ops mod_t
.comp = mod_tree_comp,
};

-static struct latch_tree_root mod_tree __cacheline_aligned;
+static struct mod_tree_root {
+ struct latch_tree_root root;
+ unsigned long addr_min;
+ unsigned long addr_max;
+} mod_tree __cacheline_aligned = {
+ .addr_min = -1UL,
+};
+
+#define module_addr_min mod_tree.addr_min
+#define module_addr_max mod_tree.addr_max
+
+static noinline void __mod_tree_insert(struct mod_tree_node *node)
+{
+ latch_tree_insert(&node->node, &mod_tree.root, &mod_tree_ops);
+}
+
+static void __mod_tree_remove(struct mod_tree_node *node)
+{
+ latch_tree_erase(&node->node, &mod_tree.root, &mod_tree_ops);
+}

/*
* These modifications: insert, remove_init and remove; are serialized by the
@@ -182,20 +201,20 @@ static void mod_tree_insert(struct modul
mod->mtn_core.mod = mod;
mod->mtn_init.mod = mod;

- latch_tree_insert(&mod->mtn_core.node, &mod_tree, &mod_tree_ops);
+ __mod_tree_insert(&mod->mtn_core);
if (mod->init_size)
- latch_tree_insert(&mod->mtn_init.node, &mod_tree, &mod_tree_ops);
+ __mod_tree_insert(&mod->mtn_init);
}

static void mod_tree_remove_init(struct module *mod)
{
if (mod->init_size)
- latch_tree_erase(&mod->mtn_init.node, &mod_tree, &mod_tree_ops);
+ __mod_tree_remove(&mod->mtn_init);
}

static void mod_tree_remove(struct module *mod)
{
- latch_tree_erase(&mod->mtn_core.node, &mod_tree, &mod_tree_ops);
+ __mod_tree_remove(&mod->mtn_core);
mod_tree_remove_init(mod);
}

@@ -203,7 +222,7 @@ static struct module *mod_find(unsigned
{
struct latch_tree_node *ltn;

- ltn = latch_tree_find((void *)addr, &mod_tree, &mod_tree_ops);
+ ltn = latch_tree_find((void *)addr, &mod_tree.root, &mod_tree_ops);
if (!ltn)
return NULL;

@@ -212,6 +231,12 @@ static struct module *mod_find(unsigned

#else /* !(PERF_EVENTS || TRACING) */

+/*
+ * Bounds of module allocation, for speeding up __module_address.
+ * Protected by module_mutex.
+ */
+static unsigned long module_addr_min = -1UL, module_addr_max = 0;
+
static void mod_tree_insert(struct module *mod) { }
static void mod_tree_remove_init(struct module *mod) { }
static void mod_tree_remove(struct module *mod) { }
@@ -230,6 +255,24 @@ static struct module *mod_find(unsigned

#endif /* !(PERF_EVENTS || TRACING) */

+static void __mod_update_bounds(void *base, unsigned int size)
+{
+ unsigned long min = (unsigned long)base;
+ unsigned long max = min + size;
+
+ if (min < module_addr_min)
+ module_addr_min = min;
+ if (max > module_addr_max)
+ module_addr_max = max;
+}
+
+static void mod_update_bounds(struct module *mod)
+{
+ __mod_update_bounds(mod->module_core, mod->core_size);
+ if (mod->init_size)
+ __mod_update_bounds(mod->module_init, mod->init_size);
+}
+
#ifdef CONFIG_KGDB_KDB
struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
#endif /* CONFIG_KGDB_KDB */
@@ -300,10 +343,6 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq

static BLOCKING_NOTIFIER_HEAD(module_notify_list);

-/* Bounds of module allocation, for speeding __module_address.
- * Protected by module_mutex. */
-static unsigned long module_addr_min = -1UL, module_addr_max = 0;
-
int register_module_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&module_notify_list, nb);
@@ -2542,22 +2581,6 @@ void * __weak module_alloc(unsigned long
return vmalloc_exec(size);
}

-static void *module_alloc_update_bounds(unsigned long size)
-{
- void *ret = module_alloc(size);
-
- if (ret) {
- mutex_lock(&module_mutex);
- /* Update module bounds. */
- if ((unsigned long)ret < module_addr_min)
- module_addr_min = (unsigned long)ret;
- if ((unsigned long)ret + size > module_addr_max)
- module_addr_max = (unsigned long)ret + size;
- mutex_unlock(&module_mutex);
- }
- return ret;
-}
-
#ifdef CONFIG_DEBUG_KMEMLEAK
static void kmemleak_load_module(const struct module *mod,
const struct load_info *info)
@@ -2942,7 +2965,7 @@ static int move_module(struct module *mo
void *ptr;

/* Do the allocs. */
- ptr = module_alloc_update_bounds(mod->core_size);
+ ptr = module_alloc(mod->core_size);
/*
* The pointer to this block is stored in the module structure
* which is inside the block. Just mark it as not being a
@@ -2956,7 +2979,7 @@ static int move_module(struct module *mo
mod->module_core = ptr;

if (mod->init_size) {
- ptr = module_alloc_update_bounds(mod->init_size);
+ ptr = module_alloc(mod->init_size);
/*
* The pointer to this block is stored in the module structure
* which is inside the block. This block doesn't need to be
@@ -3327,6 +3350,7 @@ static int add_unformed_module(struct mo
goto out;
}
list_add_rcu(&mod->list, &modules);
+ mod_update_bounds(mod);
mod_tree_insert(mod);
err = 0;

@@ -3967,11 +3991,11 @@ struct module *__module_address(unsigned
{
struct module *mod;

+ module_assert_mutex_or_preempt();
+
if (addr < module_addr_min || addr > module_addr_max)
return NULL;

- module_assert_mutex_or_preempt();
-
mod = mod_find(addr);
if (mod) {
BUG_ON(!within_module(addr, mod));


2015-04-13 16:56:44

by Ingo Molnar

Subject: Re: [PATCH v5 10/10] module: Rework module_addr_{min,max}


* Peter Zijlstra <[email protected]> wrote:

> __module_address() does an initial bound check before doing the
> {list/tree} iteration to find the actual module. The bound variables
> are nowhere near the mod_tree cacheline; in fact, they're nowhere
> near one another.
>
> module_addr_min lives in .data while module_addr_max lives in .bss
> (smarty pants GCC thinks the explicit 0 assignment is a mistake).
>
> Rectify this by moving the two variables into a structure together
> with the latch_tree_root to guarantee they all share the same
> cacheline and avoid hitting two extra cachelines for the lookup.
>
> While reworking the bounds code, move the bound update from
> allocation to insertion time; this avoids updating the bounds for a
> few error paths.

> +static struct mod_tree_root {
> + struct latch_tree_root root;
> + unsigned long addr_min;
> + unsigned long addr_max;
> +} mod_tree __cacheline_aligned = {
> + .addr_min = -1UL,
> +};
> +
> +#define module_addr_min mod_tree.addr_min
> +#define module_addr_max mod_tree.addr_max

> #else /* !(PERF_EVENTS || TRACING) */
>
> +/*
> + * Bounds of module allocation, for speeding up __module_address.
> + * Protected by module_mutex.
> + */
> +static unsigned long module_addr_min = -1UL, module_addr_max = 0;

I suspect the same .data vs. .bss problem affects the #else branch as
well?

If so then it would make sense IMHO to put the structure definition
into generic code so that both variants benefit from the shared
cacheline?

Thanks,

Ingo

2015-04-14 02:57:49

by Rusty Russell

Subject: Re: [PATCH v5 10/10] module: Rework module_addr_{min,max}

Ingo Molnar <[email protected]> writes:
> * Peter Zijlstra <[email protected]> wrote:
>
>> __module_address() does an initial bound check before doing the
>> {list/tree} iteration to find the actual module. The bound variables
>> are nowhere near the mod_tree cacheline; in fact, they're nowhere
>> near one another.
>>
>> module_addr_min lives in .data while module_addr_max lives in .bss
>> (smarty pants GCC thinks the explicit 0 assignment is a mistake).
>>
>> Rectify this by moving the two variables into a structure together
>> with the latch_tree_root to guarantee they all share the same
>> cacheline and avoid hitting two extra cachelines for the lookup.
>>
>> While reworking the bounds code, move the bound update from
>> allocation to insertion time; this avoids updating the bounds for a
>> few error paths.
>
>> +static struct mod_tree_root {
>> + struct latch_tree_root root;
>> + unsigned long addr_min;
>> + unsigned long addr_max;
>> +} mod_tree __cacheline_aligned = {
>> + .addr_min = -1UL,
>> +};
>> +
>> +#define module_addr_min mod_tree.addr_min
>> +#define module_addr_max mod_tree.addr_max

Nice catch.

Does the min/max comparison still win us anything? (I'm guessing yes...)

In general, I'm happy with this series. Assume you want another
go-round for Ingo's tweaks, then I'll take them for 4.2.

Thanks,
Rusty.

2015-04-14 06:43:28

by Peter Zijlstra

Subject: Re: [PATCH v5 10/10] module: Rework module_addr_{min,max}

On Tue, Apr 14, 2015 at 12:25:45PM +0930, Rusty Russell wrote:
> Ingo Molnar <[email protected]> writes:
> > * Peter Zijlstra <[email protected]> wrote:
> >
> >> __module_address() does an initial bound check before doing the
> >> {list/tree} iteration to find the actual module. The bound variables
> >> are nowhere near the mod_tree cacheline; in fact, they're nowhere
> >> near one another.
> >>
> >> module_addr_min lives in .data while module_addr_max lives in .bss
> >> (smarty pants GCC thinks the explicit 0 assignment is a mistake).
> >>
> >> Rectify this by moving the two variables into a structure together
> >> with the latch_tree_root to guarantee they all share the same
> >> cacheline and avoid hitting two extra cachelines for the lookup.
> >>
> >> While reworking the bounds code, move the bound update from
> >> allocation to insertion time; this avoids updating the bounds for a
> >> few error paths.
> >
> >> +static struct mod_tree_root {
> >> + struct latch_tree_root root;
> >> + unsigned long addr_min;
> >> + unsigned long addr_max;
> >> +} mod_tree __cacheline_aligned = {
> >> + .addr_min = -1UL,
> >> +};
> >> +
> >> +#define module_addr_min mod_tree.addr_min
> >> +#define module_addr_max mod_tree.addr_max
>
> Nice catch.
>
> Does the min/max comparison still win us anything? (I'm guessing yes...)

Yep; while a tree iteration is much faster than the linear thing, it is
still quite a bit slower than two simple compares.
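
[ Concretely, the fast path the patch keeps in __module_address(), with
both bounds now on the same cacheline as the tree root:

	if (addr < module_addr_min || addr > module_addr_max)
		return NULL;	/* two compares, no tree walk */

	mod = mod_find(addr);	/* latch-tree lookup only if in range */

  so addresses outside the module area -- the common case -- never
  touch the tree at all. ]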

> In general, I'm happy with this series. Assume you want another
> go-round for Ingo's tweaks, then I'll take them for 4.2.

Thanks!

2015-04-14 12:57:50

by Peter Zijlstra

Subject: Re: [PATCH v5 10/10] module: Rework module_addr_{min,max}

On Mon, Apr 13, 2015 at 06:56:36PM +0200, Ingo Molnar wrote:
> > + * Bounds of module allocation, for speeding up __module_address.
> > + * Protected by module_mutex.
> > + */
> > +static unsigned long module_addr_min = -1UL, module_addr_max = 0;
>
> I suspect the same .data vs. .bss problem affects the #else branch as
> well?

Yes, but the linear walk already has a 'problem' other than the walk
itself being one: the list_head isn't actually on the same cacheline as
the 'key' entries -- although I suppose I could fix that for the
!CONFIG_MODULES_TREE_LOOKUP case.
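
[ An abbreviated sketch of the 4.0-era struct module, most fields
elided, showing the distance between the list linkage and the 'key'
entries the walk compares against:

	struct module {
		enum module_state state;
		struct list_head list;	/* linear-walk linkage */
		char name[MODULE_NAME_LEN];
		/* ... dozens of intervening fields ... */
		void *module_init;	/* 'key' entries checked by */
		void *module_core;	/* within_module() on every  */
		unsigned int init_size;	/* iteration of the walk     */
		unsigned int core_size;
		/* ... */
	}; ]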

> If so then it would make sense IMHO to put the structure definition
> into generic code so that both variants benefit from the shared
> cacheline?

Isn't this optimizing hopeless code? I mean, I can make the change;
something like the below. Although I suppose we should use
____cacheline_aligned here and just take the false sharing.
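
[ For reference, the two macros as defined in include/linux/cache.h,
simplified -- the exact form is config-dependent:

	/* only aligns the start of the object; whatever the linker
	 * places next may still share its last cacheline (the false
	 * sharing referred to above) */
	#define ____cacheline_aligned \
		__attribute__((__aligned__(SMP_CACHE_BYTES)))

	/* additionally moves the object into a dedicated section,
	 * packed with the other cacheline-aligned objects */
	#define __cacheline_aligned \
		__attribute__((__aligned__(SMP_CACHE_BYTES), \
			       __section__(".data..cacheline_aligned"))) ]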

--- a/kernel/module.c
+++ b/kernel/module.c
@@ -230,11 +230,15 @@ static struct module *mod_find(unsigned

#else /* MODULES_TREE_LOOKUP */

-/*
- * Bounds of module allocation, for speeding up __module_address.
- * Protected by module_mutex.
- */
-static unsigned long module_addr_min = -1UL, module_addr_max = 0;
+static struct mod_bounds {
+ unsigned long addr_min;
+ unsigned long addr_max;
+} mod_bounds __cacheline_aligned = {
+ .addr_min = -1UL,
+};
+
+#define module_addr_min mod_bounds.addr_min
+#define module_addr_max mod_bounds.addr_max

static void mod_tree_insert(struct module *mod) { }
static void mod_tree_remove_init(struct module *mod) { }
@@ -254,6 +258,10 @@ static struct module *mod_find(unsigned

#endif /* MODULES_TREE_LOOKUP */

+/*
+ * Bounds of module text, for speeding up __module_address.
+ * Protected by module_mutex.
+ */
static void __mod_update_bounds(void *base, unsigned int size)
{
unsigned long min = (unsigned long)base;
@@ -3363,8 +3371,8 @@ static int add_unformed_module(struct mo
err = -EEXIST;
goto out;
}
- list_add_rcu(&mod->list, &modules);
mod_update_bounds(mod);
+ list_add_rcu(&mod->list, &modules);
mod_tree_insert(mod);
err = 0;

2015-04-14 13:00:55

by Ingo Molnar

Subject: Re: [PATCH v5 10/10] module: Rework module_addr_{min,max}


* Peter Zijlstra <[email protected]> wrote:

> On Mon, Apr 13, 2015 at 06:56:36PM +0200, Ingo Molnar wrote:
> > > + * Bounds of module allocation, for speeding up __module_address.
> > > + * Protected by module_mutex.
> > > + */
> > > +static unsigned long module_addr_min = -1UL, module_addr_max = 0;
> >
> > I suspect the same .data vs. .bss problem affects the #else branch as
> > well?
>
> Yes, but the linear walk already has a 'problem' other than the walk
> itself being one: the list_head isn't actually on the same cacheline as
> the 'key' entries -- although I suppose I could fix that for the
> !CONFIG_MODULES_TREE_LOOKUP case.
>
> > If so then it would make sense IMHO to put the structure definition
> > into generic code so that both variants benefit from the shared
> > cacheline?
>
> Isn't this optimizing hopeless code? I mean, I can make the change;
> something like the below. Although I suppose we should use
> ____cacheline_aligned here and just take the false sharing.

Well, I think the point is to share more code and move the two
variants closer to each other without hurting either side - not to
optimize the slower side necessarily.

But I have no strong opinions either way!

Thanks,

Ingo