2016-10-24 16:08:41

by Roman Pen

[permalink] [raw]
Subject: [PATCH v2 1/2] kthread: allocate kthread structure using kmalloc

This patch avoids allocation of the kthread structure on the stack, and simply
uses kmalloc. Allocation on the stack became a huge problem (with memory
corruption and all other unpleasant consequences) after commit 2deb4be28077
("x86/dumpstack: When OOPSing, rewind the stack before do_exit()") by Andy
Lutomirski, which rewinds the stack on oops, so an oopsed kthread steps on
garbage memory while completing task->vfork_done on the following path:

oops_end()
rewind_stack_do_exit()
exit_mm()
mm_release()
complete_vfork_done()

Also in this patch two structures 'struct kthread_create_info' and
'struct kthread' are merged into one 'struct kthread' and its freeing
is controlled by a reference counter.

The last reference on the kthread is put from a task work callback,
which is invoked from do_exit(). The major point is that the last
put happens *after* complete_vfork_done() is invoked.

Signed-off-by: Roman Pen <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: [email protected]
---
v2:
o let x86/kernel/dumpstack.c rewind a stack, but do not use a stack
for a structure allocation.

kernel/kthread.c | 160 +++++++++++++++++++++++++++++++------------------------
1 file changed, 90 insertions(+), 70 deletions(-)

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4ab4c37..9ccfe06 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -18,14 +18,19 @@
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
+#include <linux/task_work.h>
#include <trace/events/sched.h>

static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;

-struct kthread_create_info
-{
+struct kthread {
+ struct list_head list;
+ unsigned long flags;
+ unsigned int cpu;
+ atomic_t refs;
+
/* Information passed to kthread() from kthreadd. */
int (*threadfn)(void *data);
void *data;
@@ -33,15 +38,9 @@ struct kthread_create_info

/* Result passed back to kthread_create() from kthreadd. */
struct task_struct *result;
- struct completion *done;
-
- struct list_head list;
-};

-struct kthread {
- unsigned long flags;
- unsigned int cpu;
- void *data;
+ struct callback_head put_work;
+ struct completion *started;
struct completion parked;
struct completion exited;
};
@@ -69,6 +68,24 @@ static struct kthread *to_live_kthread(struct task_struct *k)
return NULL;
}

+static inline void put_kthread(struct kthread *kthread)
+{
+ if (atomic_dec_and_test(&kthread->refs))
+ kfree(kthread);
+}
+
+/**
+ * put_kthread_cb - is called from do_exit() and does likely
+ * the final put.
+ */
+static void put_kthread_cb(struct callback_head *work)
+{
+ struct kthread *kthread;
+
+ kthread = container_of(work, struct kthread, put_work);
+ put_kthread(kthread);
+}
+
/**
* kthread_should_stop - should this kthread return now?
*
@@ -174,41 +191,36 @@ void kthread_parkme(void)
}
EXPORT_SYMBOL_GPL(kthread_parkme);

-static int kthread(void *_create)
+static int kthreadfn(void *_self)
{
- /* Copy data: it's on kthread's stack */
- struct kthread_create_info *create = _create;
- int (*threadfn)(void *data) = create->threadfn;
- void *data = create->data;
- struct completion *done;
- struct kthread self;
- int ret;
-
- self.flags = 0;
- self.data = data;
- init_completion(&self.exited);
- init_completion(&self.parked);
- current->vfork_done = &self.exited;
-
- /* If user was SIGKILLed, I release the structure. */
- done = xchg(&create->done, NULL);
- if (!done) {
- kfree(create);
- do_exit(-EINTR);
+ struct completion *started;
+ struct kthread *self = _self;
+ int ret = -EINTR;
+
+ /* If user was SIGKILLed, put a ref and exit silently. */
+ started = xchg(&self->started, NULL);
+ if (!started) {
+ put_kthread(self);
+ goto exit;
}
+ /* Delegate last ref put to a task work, which will happen
+ * after 'vfork_done' completion.
+ */
+ init_task_work(&self->put_work, put_kthread_cb);
+ task_work_add(current, &self->put_work, false);
+ current->vfork_done = &self->exited;
+
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
- create->result = current;
- complete(done);
+ self->result = current;
+ complete(started);
schedule();

- ret = -EINTR;
-
- if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) {
- __kthread_parkme(&self);
- ret = threadfn(data);
+ if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
+ __kthread_parkme(self);
+ ret = self->threadfn(self->data);
}
- /* we can't just return, we must preserve "self" on stack */
+exit:
do_exit(ret);
}

@@ -222,25 +234,24 @@ int tsk_fork_get_node(struct task_struct *tsk)
return NUMA_NO_NODE;
}

-static void create_kthread(struct kthread_create_info *create)
+static void create_kthread(struct kthread *kthread)
{
+ struct completion *started;
int pid;

#ifdef CONFIG_NUMA
- current->pref_node_fork = create->node;
+ current->pref_node_fork = kthread->node;
#endif
/* We want our own signal handler (we take no signals by default). */
- pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
+ pid = kernel_thread(kthreadfn, kthread,
+ CLONE_FS | CLONE_FILES | SIGCHLD);
if (pid < 0) {
- /* If user was SIGKILLed, I release the structure. */
- struct completion *done = xchg(&create->done, NULL);
-
- if (!done) {
- kfree(create);
- return;
+ started = xchg(&kthread->started, NULL);
+ if (started) {
+ kthread->result = ERR_PTR(pid);
+ complete(started);
}
- create->result = ERR_PTR(pid);
- complete(done);
+ put_kthread(kthread);
}
}

@@ -272,20 +283,26 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
const char namefmt[],
...)
{
- DECLARE_COMPLETION_ONSTACK(done);
+ DECLARE_COMPLETION_ONSTACK(started);
struct task_struct *task;
- struct kthread_create_info *create = kmalloc(sizeof(*create),
- GFP_KERNEL);
+ struct kthread *kthread;

- if (!create)
+ kthread = kmalloc(sizeof(*kthread), GFP_KERNEL);
+ if (!kthread)
return ERR_PTR(-ENOMEM);
- create->threadfn = threadfn;
- create->data = data;
- create->node = node;
- create->done = &done;
+ /* One ref for us and one ref for a new kernel thread. */
+ atomic_set(&kthread->refs, 2);
+ kthread->flags = 0;
+ kthread->cpu = 0;
+ kthread->threadfn = threadfn;
+ kthread->data = data;
+ kthread->node = node;
+ kthread->started = &started;
+ init_completion(&kthread->exited);
+ init_completion(&kthread->parked);

spin_lock(&kthread_create_lock);
- list_add_tail(&create->list, &kthread_create_list);
+ list_add_tail(&kthread->list, &kthread_create_list);
spin_unlock(&kthread_create_lock);

wake_up_process(kthreadd_task);
@@ -294,21 +311,23 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
* the OOM killer while kthreadd is trying to allocate memory for
* new kernel thread.
*/
- if (unlikely(wait_for_completion_killable(&done))) {
+ if (unlikely(wait_for_completion_killable(&started))) {
/*
* If I was SIGKILLed before kthreadd (or new kernel thread)
- * calls complete(), leave the cleanup of this structure to
- * that thread.
+ * calls complete(), put a ref and return an error.
*/
- if (xchg(&create->done, NULL))
+ if (xchg(&kthread->started, NULL)) {
+ put_kthread(kthread);
+
return ERR_PTR(-EINTR);
+ }
/*
* kthreadd (or new kernel thread) will call complete()
* shortly.
*/
- wait_for_completion(&done);
+ wait_for_completion(&started);
}
- task = create->result;
+ task = kthread->result;
if (!IS_ERR(task)) {
static const struct sched_param param = { .sched_priority = 0 };
va_list args;
@@ -323,7 +342,8 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
set_cpus_allowed_ptr(task, cpu_all_mask);
}
- kfree(create);
+ put_kthread(kthread);
+
return task;
}
EXPORT_SYMBOL(kthread_create_on_node);
@@ -523,14 +543,14 @@ int kthreadd(void *unused)

spin_lock(&kthread_create_lock);
while (!list_empty(&kthread_create_list)) {
- struct kthread_create_info *create;
+ struct kthread *kthread;

- create = list_entry(kthread_create_list.next,
- struct kthread_create_info, list);
- list_del_init(&create->list);
+ kthread = list_entry(kthread_create_list.next,
+ struct kthread, list);
+ list_del_init(&kthread->list);
spin_unlock(&kthread_create_lock);

- create_kthread(create);
+ create_kthread(kthread);

spin_lock(&kthread_create_lock);
}
--
2.9.3


2016-10-24 16:08:44

by Roman Pen

[permalink] [raw]
Subject: [PATCH v2 2/2] workqueue: ignore dead tasks in a workqueue sleep hook

If panic_on_oops is not set and an oops happens inside a workqueue kthread,
the kernel kills this kthread. This patch fixes a recursive GPF which
happens when the wq_worker_sleeping() function unconditionally accesses
the NULL kthread->vfork_done ptr through kthread_data() -> to_kthread().

The stack is the following:

[<ffffffff81397f75>] dump_stack+0x68/0x93
[<ffffffff8106954b>] ? do_exit+0x7ab/0xc10
[<ffffffff8108fd73>] __schedule_bug+0x83/0xe0
[<ffffffff81716d5a>] __schedule+0x7ea/0xba0
[<ffffffff810c864f>] ? vprintk_default+0x1f/0x30
[<ffffffff8116a63c>] ? printk+0x48/0x50
[<ffffffff81717150>] schedule+0x40/0x90
[<ffffffff8106976a>] do_exit+0x9ca/0xc10
[<ffffffff810c8e3d>] ? kmsg_dump+0x11d/0x190
[<ffffffff810c8d37>] ? kmsg_dump+0x17/0x190
[<ffffffff81021ee9>] oops_end+0x99/0xd0
[<ffffffff81052da5>] no_context+0x185/0x3e0
[<ffffffff81053083>] __bad_area_nosemaphore+0x83/0x1c0
[<ffffffff810c820e>] ? vprintk_emit+0x25e/0x530
[<ffffffff810531d4>] bad_area_nosemaphore+0x14/0x20
[<ffffffff8105355c>] __do_page_fault+0xac/0x570
[<ffffffff810c66fe>] ? console_trylock+0x1e/0xe0
[<ffffffff81002036>] ? trace_hardirqs_off_thunk+0x1a/0x1c
[<ffffffff81053a2c>] do_page_fault+0xc/0x10
[<ffffffff8171f812>] page_fault+0x22/0x30
[<ffffffff81089bc3>] ? kthread_data+0x33/0x40
[<ffffffff8108427e>] ? wq_worker_sleeping+0xe/0x80
[<ffffffff817169eb>] __schedule+0x47b/0xba0
[<ffffffff81717150>] schedule+0x40/0x90
[<ffffffff8106957d>] do_exit+0x7dd/0xc10
[<ffffffff81021ee9>] oops_end+0x99/0xd0

kthread->vfork_done is zeroed out on the following path:

do_exit()
exit_mm()
mm_release()
complete_vfork_done()

In order to fix the bug, dead tasks must be ignored.

Signed-off-by: Roman Pen <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: [email protected]
---
v2:
o put a task->state check directly into a wq_worker_sleeping() function
instead of changing the __schedule().

kernel/workqueue.c | 24 +++++++++++++++++++++++-
1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9dc7ac5..b19dcb6 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -875,9 +875,31 @@ void wq_worker_waking_up(struct task_struct *task, int cpu)
*/
struct task_struct *wq_worker_sleeping(struct task_struct *task)
{
- struct worker *worker = kthread_data(task), *to_wakeup = NULL;
+ struct worker *worker, *to_wakeup = NULL;
struct worker_pool *pool;

+
+ if (task->state == TASK_DEAD)
+ /* Here we try to catch the following path before
+ * accessing NULL kthread->vfork_done ptr thru
+ * kthread_data():
+ *
+ * oops_end()
+ * do_exit()
+ * schedule()
+ *
+ * If panic_on_oops is not set and oops happens on
+ * a workqueue execution path, thread will be killed.
+ * That is definitely sad, but in order not to make the
+ * situation even worse we have to ignore dead tasks so as
+ * not to step on zeroed-out members (e.g. t->vfork_done is
+ * already NULL on that path, since we were called by
+ * do_exit()).
+ */
+ return NULL;
+
+ worker = kthread_data(task);
+
/*
* Rescuers, which may not have all the fields set up like normal
* workers, also reach here, let's not access anything before
--
2.9.3

2016-10-24 16:40:48

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 2/2] workqueue: ignore dead tasks in a workqueue sleep hook

On Mon, Oct 24, 2016 at 06:08:14PM +0200, Roman Pen wrote:

> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -875,9 +875,31 @@ void wq_worker_waking_up(struct task_struct *task, int cpu)
> */
> struct task_struct *wq_worker_sleeping(struct task_struct *task)
> {
> - struct worker *worker = kthread_data(task), *to_wakeup = NULL;
> + struct worker *worker, *to_wakeup = NULL;
> struct worker_pool *pool;
>
> +
> + if (task->state == TASK_DEAD)
> + /* Here we try to catch the following path before
> + * accessing NULL kthread->vfork_done ptr thru
> + * kthread_data():
> + *
> + * oops_end()
> + * do_exit()
> + * schedule()
> + *
> + * If panic_on_oops is not set and oops happens on
> + * a workqueue execution path, thread will be killed.
> + * That is definitly sad, but not to make the situation
> + * even worse we have to ignore dead tasks in order not
> + * to step on zeroed out members (e.g. t->vfork_done is
> + * already NULL on that path, since we were called by
> + * do_exit())).
> + */
> + return NULL;

https://lkml.kernel.org/r/CA+55aFyQYJerovMsSoSKS7PessZBr4vNp-3QUUwhqk4A4_jcbg@mail.gmail.com

Also, that misses { }.

> +
> + worker = kthread_data(task);
> +
> /*
> * Rescuers, which may not have all the fields set up like normal
> * workers, also reach here, let's not access anything before
> --
> 2.9.3
>

2016-10-24 16:42:23

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 1/2] kthread: allocate kthread structure using kmalloc

On Mon, Oct 24, 2016 at 06:08:13PM +0200, Roman Pen wrote:
> This patch avoids allocation of kthread structure on a stack, and simply
> uses kmalloc. Allocation on a stack became a huge problem (with memory
> corruption and all other not nice consequences) after the commit 2deb4be28

2deb4be28077 ("x86/dumpstack: When OOPSing, rewind the stack before do_exit()")

Is the normal quoting style.

.gitconfig:

[core]
abbrev = 12
[alias]
one = show -s --pretty='format:%h (\"%s\")'

> by Andy Lutomirski, which rewinds the stack on oops, thus ooopsed kthread
> steps on a garbage memory while completion of task->vfork_done structure
> on the following path:
>
> oops_end()
> rewind_stack_do_exit()
> exit_mm()
> mm_release()
> complete_vfork_done()
>
> Also in this patch two structures 'struct kthread_create_info' and
> 'struct kthread' are merged into one 'struct kthread' and its freeing
> is controlled by a reference counter.
>
> The last reference on kthread is put from a task work, the callback,
> which is invoked from do_exit(). The major thing is that the last
> put is happens *after* completion_vfork_done() is invoked.
>

2016-10-24 17:08:43

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH v2 1/2] kthread: allocate kthread structure using kmalloc

On Mon, Oct 24, 2016 at 9:08 AM, Roman Pen
<[email protected]> wrote:
> This patch avoids allocation of kthread structure on a stack, and simply
> uses kmalloc. Allocation on a stack became a huge problem (with memory
> corruption and all other not nice consequences) after the commit 2deb4be28
> by Andy Lutomirski, which rewinds the stack on oops, thus ooopsed kthread
> steps on a garbage memory while completion of task->vfork_done structure
> on the following path:

This is IMO a *huge* improvement.

Shouldn't the patch also remove the try_get_task_stack() /
put_task_stack() hackery in kthread.c, though?

2016-10-24 19:10:10

by Roman Pen

[permalink] [raw]
Subject: Re: [PATCH v2 2/2] workqueue: ignore dead tasks in a workqueue sleep hook

On Mon, Oct 24, 2016 at 6:40 PM, Peter Zijlstra <[email protected]> wrote:
> On Mon, Oct 24, 2016 at 06:08:14PM +0200, Roman Pen wrote:
>
>> --- a/kernel/workqueue.c
>> +++ b/kernel/workqueue.c
>> @@ -875,9 +875,31 @@ void wq_worker_waking_up(struct task_struct *task, int cpu)
>> */
>> struct task_struct *wq_worker_sleeping(struct task_struct *task)
>> {
>> - struct worker *worker = kthread_data(task), *to_wakeup = NULL;
>> + struct worker *worker, *to_wakeup = NULL;
>> struct worker_pool *pool;
>>
>> +
>> + if (task->state == TASK_DEAD)
>> + /* Here we try to catch the following path before
>> + * accessing NULL kthread->vfork_done ptr thru
>> + * kthread_data():
>> + *
>> + * oops_end()
>> + * do_exit()
>> + * schedule()
>> + *
>> + * If panic_on_oops is not set and oops happens on
>> + * a workqueue execution path, thread will be killed.
>> + * That is definitly sad, but not to make the situation
>> + * even worse we have to ignore dead tasks in order not
>> + * to step on zeroed out members (e.g. t->vfork_done is
>> + * already NULL on that path, since we were called by
>> + * do_exit())).
>> + */
>> + return NULL;
>
> https://lkml.kernel.org/r/CA+55aFyQYJerovMsSoSKS7PessZBr4vNp-3QUUwhqk4A4_jcbg@mail.gmail.com

Ha, explicit comment from Linus :) Ok.

> Also, that misses { }.

Ok.

--
Roman

2016-10-24 19:10:31

by Roman Pen

[permalink] [raw]
Subject: Re: [PATCH v2 1/2] kthread: allocate kthread structure using kmalloc

On Mon, Oct 24, 2016 at 6:42 PM, Peter Zijlstra <[email protected]> wrote:
> On Mon, Oct 24, 2016 at 06:08:13PM +0200, Roman Pen wrote:
>> This patch avoids allocation of kthread structure on a stack, and simply
>> uses kmalloc. Allocation on a stack became a huge problem (with memory
>> corruption and all other not nice consequences) after the commit 2deb4be28
>
> 2deb4be28077 ("x86/dumpstack: When OOPSing, rewind the stack before do_exit()")
>
> Is the normal quoting style.
>
> .gitconfig:
>
> [core]
> abbrev = 12
> [alias]
> one = show -s --pretty='format:%h (\"%s\")'

Nice tip. Thanks.

--
Roman

2016-10-24 19:37:57

by Roman Pen

[permalink] [raw]
Subject: Re: [PATCH v2 1/2] kthread: allocate kthread structure using kmalloc

On Mon, Oct 24, 2016 at 7:08 PM, Andy Lutomirski <[email protected]> wrote:
> On Mon, Oct 24, 2016 at 9:08 AM, Roman Pen
> <[email protected]> wrote:
>> This patch avoids allocation of kthread structure on a stack, and simply
>> uses kmalloc. Allocation on a stack became a huge problem (with memory
>> corruption and all other not nice consequences) after the commit 2deb4be28
>> by Andy Lutomirski, which rewinds the stack on oops, thus ooopsed kthread
>> steps on a garbage memory while completion of task->vfork_done structure
>> on the following path:
>
> This is IMO a *huge* improvement.
>
> Shouldn't the patch also remove the try_get_task_stack() /
> put_task_stack() hackery in kthread.c, though?

Do you mean that commit from Oleg https://patchwork.kernel.org/patch/9335295/ ?

Hm, I missed that to_live_kthread() function completely. So, yes, we can remove
put/get_task_stack(), but only replacing on get/put_kthread. So still need to
be careful with the refs.

Oleg, could you please take a look on the hunk below, just a quick thought.
(get_kthread is simple atomic kthread->refs++).

--
Roman

--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -64,9 +64,20 @@ static inline struct kthread *to_kthread(struct
task_struct *k)
static struct kthread *to_live_kthread(struct task_struct *k)
{
struct completion *vfork = ACCESS_ONCE(k->vfork_done);
- if (likely(vfork) && try_get_task_stack(k))
- return __to_kthread(vfork);
- return NULL;
+ struct kthread *kthread = NULL;
+
+ BUG_ON(!(k->flags & PF_KTHREAD));
+ if (likely(vfork)) {
+ task_lock(k);
+ vfork = ACCESS_ONCE(k->vfork_done);
+ if (likely(vfork)) {
+ kthread = __to_kthread(vfork);
+ get_kthread(kthread);
+ }
+ task_unlock(k);
+ }
+
+ return kthread;
}

2016-10-24 20:27:32

by Tejun Heo

[permalink] [raw]
Subject: Re: [PATCH v2 2/2] workqueue: ignore dead tasks in a workqueue sleep hook

Hello,

On Mon, Oct 24, 2016 at 06:08:14PM +0200, Roman Pen wrote:
> If panic_on_oops is not set and oops happens inside workqueue kthread,
> kernel kills this kthread. Current patch fixes recursive GPF which
> happens when wq_worker_sleeping() function unconditionally accesses
> the NULL kthread->vfork_done ptr trhu kthread_data() -> to_kthread().

Other than the pointed out style issues, looks good to me. If you
send an updated version, I'll route it through wq/for-4.9-fixes.

Thanks.

--
tejun