The intent is to avoid writing init code after init (because the text
might have been freed). The code is needlessly different between
jump_label and static_call and not obviously correct.
The existing code relies on the fact that the module loader clears the
init layout, such that within_module_init() always fails, while
jump_label relies on the module state which is more obvious and
matches the kernel logic.
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/static_call.c | 14 ++++----------
1 file changed, 4 insertions(+), 10 deletions(-)
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -149,6 +149,7 @@ void __static_call_update(struct static_
};
for (site_mod = &first; site_mod; site_mod = site_mod->next) {
+ bool init = system_state < SYSTEM_RUNNING;
struct module *mod = site_mod->mod;
if (!site_mod->sites) {
@@ -168,6 +169,7 @@ void __static_call_update(struct static_
if (mod) {
stop = mod->static_call_sites +
mod->num_static_call_sites;
+ init = mod->state == MODULE_STATE_COMING;
}
#endif
@@ -175,16 +177,8 @@ void __static_call_update(struct static_
site < stop && static_call_key(site) == key; site++) {
void *site_addr = static_call_addr(site);
- if (static_call_is_init(site)) {
- /*
- * Don't write to call sites which were in
- * initmem and have since been freed.
- */
- if (!mod && system_state >= SYSTEM_RUNNING)
- continue;
- if (mod && !within_module_init((unsigned long)site_addr, mod))
- continue;
- }
+ if (!init && static_call_is_init(site))
+ continue;
if (!kernel_text_address((unsigned long)site_addr)) {
WARN_ONCE(1, "can't patch static call site at %pS",
The following commit has been merged into the locking/urgent branch of tip:
Commit-ID: 698bacefe993ad2922c9d3b1380591ad489355e9
Gitweb: https://git.kernel.org/tip/698bacefe993ad2922c9d3b1380591ad489355e9
Author: Peter Zijlstra <[email protected]>
AuthorDate: Thu, 18 Mar 2021 11:29:56 +01:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Fri, 19 Mar 2021 13:16:44 +01:00
static_call: Align static_call_is_init() patching condition
The intent is to avoid writing init code after init (because the text
might have been freed). The code is needlessly different between
jump_label and static_call and not obviously correct.
The existing code relies on the fact that the module loader clears the
init layout, such that within_module_init() always fails, while
jump_label relies on the module state which is more obvious and
matches the kernel logic.
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Acked-by: Jarkko Sakkinen <[email protected]>
Tested-by: Sumit Garg <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/static_call.c | 14 ++++----------
1 file changed, 4 insertions(+), 10 deletions(-)
diff --git a/kernel/static_call.c b/kernel/static_call.c
index 080c8a9..fc22590 100644
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -149,6 +149,7 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)
};
for (site_mod = &first; site_mod; site_mod = site_mod->next) {
+ bool init = system_state < SYSTEM_RUNNING;
struct module *mod = site_mod->mod;
if (!site_mod->sites) {
@@ -168,6 +169,7 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)
if (mod) {
stop = mod->static_call_sites +
mod->num_static_call_sites;
+ init = mod->state == MODULE_STATE_COMING;
}
#endif
@@ -175,16 +177,8 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)
site < stop && static_call_key(site) == key; site++) {
void *site_addr = static_call_addr(site);
- if (static_call_is_init(site)) {
- /*
- * Don't write to call sites which were in
- * initmem and have since been freed.
- */
- if (!mod && system_state >= SYSTEM_RUNNING)
- continue;
- if (mod && !within_module_init((unsigned long)site_addr, mod))
- continue;
- }
+ if (!init && static_call_is_init(site))
+ continue;
if (!kernel_text_address((unsigned long)site_addr)) {
WARN_ONCE(1, "can't patch static call site at %pS",
On 18/03/2021 12.31, Peter Zijlstra wrote:
> The intent is to avoid writing init code after init (because the text
> might have been freed). The code is needlessly different between
> jump_label and static_call and not obviously correct.
>
> The existing code relies on the fact that the module loader clears the
> init layout, such that within_module_init() always fails, while
> jump_label relies on the module state which is more obvious and
> matches the kernel logic.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> kernel/static_call.c | 14 ++++----------
> 1 file changed, 4 insertions(+), 10 deletions(-)
>
> --- a/kernel/static_call.c
> +++ b/kernel/static_call.c
> @@ -149,6 +149,7 @@ void __static_call_update(struct static_
> };
>
> for (site_mod = &first; site_mod; site_mod = site_mod->next) {
> + bool init = system_state < SYSTEM_RUNNING;
I recently had occasion to look at whether that would be a suitable
condition for knowing whether __init stuff was gone, but concluded that
it's not. Maybe I'm wrong. init/main.c:
free_initmem();
mark_readonly();
/*
* Kernel mappings are now finalized - update the userspace
page-table
* to finalize PTI.
*/
pti_finalize();
system_state = SYSTEM_RUNNING;
So ISTM there's window where system_state < SYSTEM_RUNNING but accessing
init stuff is a bad idea. If you're PID1 it's all fine, but I suppose
other kernel threads could end up calling static_call_update. And just
moving the system_state setting up before the *free_initmem() calls
doesn't really solve anything because TOCTOU.
Dunno, probably overkill, but perhaps we could have an atomic_t (or
refcount, whatever) init_ref inited to 1, with init_ref_get() doing an
inc_unless_zero, and iff you get a ref, you're free to call (/patch)
__init functions and access __initdata, but must do init_ref_put(), with
PID1 dropping its initial ref and waiting for it to drop to 0 before
doing the *free_initmem() calls.
Rasmus
On Fri, Mar 19, 2021 at 02:31:19PM +0100, Rasmus Villemoes wrote:
> On 18/03/2021 12.31, Peter Zijlstra wrote:
> > The intent is to avoid writing init code after init (because the text
> > might have been freed). The code is needlessly different between
> > jump_label and static_call and not obviously correct.
> >
> > The existing code relies on the fact that the module loader clears the
> > init layout, such that within_module_init() always fails, while
> > jump_label relies on the module state which is more obvious and
> > matches the kernel logic.
> >
> > Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> > ---
> > kernel/static_call.c | 14 ++++----------
> > 1 file changed, 4 insertions(+), 10 deletions(-)
> >
> > --- a/kernel/static_call.c
> > +++ b/kernel/static_call.c
> > @@ -149,6 +149,7 @@ void __static_call_update(struct static_
> > };
> >
> > for (site_mod = &first; site_mod; site_mod = site_mod->next) {
> > + bool init = system_state < SYSTEM_RUNNING;
>
> I recently had occasion to look at whether that would be a suitable
> condition for knowing whether __init stuff was gone, but concluded that
> it's not. Maybe I'm wrong. init/main.c:
Ha, me too:
https://lkml.kernel.org/r/YFMToXI/[email protected]
and I share your concern.
> Dunno, probably overkill, but perhaps we could have an atomic_t (or
> refcount, whatever) init_ref inited to 1, with init_ref_get() doing an
> inc_unless_zero, and iff you get a ref, you're free to call (/patch)
> __init functions and access __initdata, but must do init_ref_put(), with
> PID1 dropping its initial ref and waiting for it to drop to 0 before
> doing the *free_initmem() calls.
I'd as soon simply add another SYSTEM state. That way we don't have to
worry about who else looks at RUNNING for what etc..
On 19/03/2021 15.13, Peter Zijlstra wrote:
>> Dunno, probably overkill, but perhaps we could have an atomic_t (or
>> refcount, whatever) init_ref inited to 1, with init_ref_get() doing an
>> inc_unless_zero, and iff you get a ref, you're free to call (/patch)
>> __init functions and access __initdata, but must do init_ref_put(), with
>> PID1 dropping its initial ref and waiting for it to drop to 0 before
>> doing the *free_initmem() calls.
>
> I'd as soon simply add another SYSTEM state. That way we don't have to
> worry about who else looks at RUNNING for what etc..
I don't understand. How would that solve the
PID1 PIDX
ok = system_state < INIT_MEM_GONE;
system_state = INIT_MEM_GONE;
free_initmem();
system_state = RUNNING;
if (ok)
poke init mem
race? I really don't see any way arbitrary threads can reliably check
how far PID1 has progressed at one point in time and use that
information (a few lines) later to access init memory without
synchronizing with PID1.
AFAICT, having an atomic_t object representing the init memory and a
"no, you can't have a new reference" would guarantee correctness of the
jump_label/static_call patching: If we get a reference, we do the
patching just as for the rest of the kernel .text. If we don't, i.e.
observe atomic_load(init_ref)==0, that may not necessarily mean that
PID1 has actually discarded the memory yet, but no thread can currently
or in the future actually run __init code, so it need not be patched.
Rasmus
On 19/03/2021 15.40, Rasmus Villemoes wrote:
> On 19/03/2021 15.13, Peter Zijlstra wrote:
>
>>> Dunno, probably overkill, but perhaps we could have an atomic_t (or
>>> refcount, whatever) init_ref inited to 1, with init_ref_get() doing an
>>> inc_unless_zero, and iff you get a ref, you're free to call (/patch)
>>> __init functions and access __initdata, but must do init_ref_put(), with
>>> PID1 dropping its initial ref and waiting for it to drop to 0 before
>>> doing the *free_initmem() calls.
>>
>> I'd as soon simply add another SYSTEM state. That way we don't have to
>> worry about who else looks at RUNNING for what etc..
>
> I don't understand. How would that solve the
>
> PID1 PIDX
> ok = system_state < INIT_MEM_GONE;
> system_state = INIT_MEM_GONE;
> free_initmem();
> system_state = RUNNING;
> if (ok)
> poke init mem
>
> race? I really don't see any way arbitrary threads can reliably check
> how far PID1 has progressed at one point in time and use that
> information (a few lines) later to access init memory without
> synchronizing with PID1.
>
> AFAICT, having an atomic_t object representing the init memory
something like
--- a/init/main.c
+++ b/init/main.c
@@ -1417,6 +1417,18 @@ void __weak free_initmem(void)
free_initmem_default(POISON_FREE_INITMEM);
}
+static atomic_t init_mem_ref = ATOMIC_INIT(1);
+static DECLARE_COMPLETION(init_mem_may_go);
+bool init_mem_get(void)
+{
+ return atomic_inc_not_zero(&init_mem_ref);
+}
+void init_mem_put(void)
+{
+ if (atomic_dec_and_test(&init_mem_ref))
+ complete(&init_mem_may_go);
+}
+
static int __ref kernel_init(void *unused)
{
int ret;
@@ -1424,6 +1436,8 @@ static int __ref kernel_init(void *unused)
kernel_init_freeable();
/* need to finish all async __init code before freeing the memory */
async_synchronize_full();
+ init_mem_put();
+ wait_for_completion(&init_mem_may_go);
kprobe_free_init_mem();
ftrace_free_init_mem();
kgdb_free_init_mem();
On Fri, Mar 19, 2021 at 03:40:46PM +0100, Rasmus Villemoes wrote:
> On 19/03/2021 15.13, Peter Zijlstra wrote:
>
> >> Dunno, probably overkill, but perhaps we could have an atomic_t (or
> >> refcount, whatever) init_ref inited to 1, with init_ref_get() doing an
> >> inc_unless_zero, and iff you get a ref, you're free to call (/patch)
> >> __init functions and access __initdata, but must do init_ref_put(), with
> >> PID1 dropping its initial ref and waiting for it to drop to 0 before
> >> doing the *free_initmem() calls.
> >
> > I'd as soon simply add another SYSTEM state. That way we don't have to
> > worry about who else looks at RUNNING for what etc..
>
> I don't understand. How would that solve the
>
> PID1 PIDX
> ok = system_state < INIT_MEM_GONE;
> system_state = INIT_MEM_GONE;
> free_initmem();
> system_state = RUNNING;
> if (ok)
> poke init mem
>
> race?
Argh, I meant to put it before SMP bringup, but then still have to run
the smp_init calls :/
N/m, you're quite right.