2008-11-24 15:40:54

by Andrey Mirkin

[permalink] [raw]
Subject: [PATCH 0/2] In-kernel process restart

These patchset introduces OpenVZ kernel based restart procedure on top of
Oren's checkpoint/restart patchset v9.

For restarting a set of processes one will need to call sys_restart() once with
new flag CR_CTX_RSTR_IN_KERNEL. All work will be done in kernel in this case.

Small changes to image format are required to make in-kernel process creation
more easy.

Oren, please take a look on this patchset. I've tried to port OpenVZ
functionality on top of yours with minimal changes.


2008-11-24 15:40:27

by Andrey Mirkin

[permalink] [raw]
Subject: [PATCH 2/2] Add support for in-kernel process creation during restart

All work (process tree creation and process state restore) now can be
done in kernel.

Task structure in image file is extended with 2 fields to make in-kernel
process creation more easy.

Signed-off-by: Andrey Mirkin <[email protected]>
---
checkpoint/checkpoint.c | 17 ++++
checkpoint/restart.c | 4 +-
checkpoint/rstr_process.c | 201 +++++++++++++++++++++++++++++++++++++++-
include/linux/checkpoint.h | 2 +
include/linux/checkpoint_hdr.h | 2 +
5 files changed, 223 insertions(+), 3 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index 04b0c4a..ae3326e 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -173,6 +173,21 @@ static int cr_write_tail(struct cr_ctx *ctx)
return ret;
}

+static int cr_count_children(struct cr_ctx *ctx, struct task_struct *tsk)
+{
+ int num = 0;
+ struct task_struct *child;
+
+ read_lock(&tasklist_lock);
+ list_for_each_entry(child, &tsk->children, sibling) {
+ if (child->parent != tsk)
+ continue;
+ num++;
+ }
+ read_unlock(&tasklist_lock);
+ return num;
+}
+
/* dump the task_struct of a given task */
static int cr_write_task_struct(struct cr_ctx *ctx, struct task_struct *t)
{
@@ -189,6 +204,8 @@ static int cr_write_task_struct(struct cr_ctx *ctx, struct task_struct *t)
hh->exit_code = t->exit_code;
hh->exit_signal = t->exit_signal;

+ hh->vpid = task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns);
+ hh->children_nr = cr_count_children(ctx, t);
hh->task_comm_len = TASK_COMM_LEN;

/* FIXME: save remaining relevant task_struct fields */
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index 9259622..9f668f1 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -118,7 +118,7 @@ struct file *cr_read_open_fname(struct cr_ctx *ctx, int flags, int mode)
}

/* read the checkpoint header */
-static int cr_read_head(struct cr_ctx *ctx)
+int cr_read_head(struct cr_ctx *ctx)
{
struct cr_hdr_head *hh = cr_hbuf_get(ctx, sizeof(*hh));
int parent, ret = -EINVAL;
@@ -150,7 +150,7 @@ static int cr_read_head(struct cr_ctx *ctx)
}

/* read the checkpoint trailer */
-static int cr_read_tail(struct cr_ctx *ctx)
+int cr_read_tail(struct cr_ctx *ctx)
{
struct cr_hdr_tail *hh = cr_hbuf_get(ctx, sizeof(*hh));
int parent, ret = -EINVAL;
diff --git a/checkpoint/rstr_process.c b/checkpoint/rstr_process.c
index ec9e51b..c34378f 100644
--- a/checkpoint/rstr_process.c
+++ b/checkpoint/rstr_process.c
@@ -12,9 +12,208 @@
*
*/

+#include <linux/version.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/file.h>
+#include <linux/magic.h>
#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
+#include "checkpoint_arch.h"
+
+struct thr_context {
+ struct completion complete;
+ int error;
+ struct cr_ctx *ctx;
+ struct cr_hdr_task *ht;
+};
+
+static int cr_restart_process(struct cr_ctx *ctx);
+
+static int cr_kernel_thread(int (*fn)(void *), void * arg,
+ unsigned long flags, pid_t pid)
+{
+ if (current->fs == NULL) {
+ /* do_fork() hates processes without fs, oopses. */
+ cr_debug("local_kernel_thread: current->fs==NULL\n");
+ return -EINVAL;
+ }
+ return kernel_thread(fn, arg, flags);
+}
+
+static int cr_rstr_task_struct(struct cr_ctx *ctx, struct cr_hdr_task *ht)
+{
+ struct task_struct *t = current;
+ char *buf;
+ int ret = -EINVAL;
+
+ /* upper limit for task_comm_len to prevent DoS */
+ if (ht->task_comm_len < 0 || ht->task_comm_len > PAGE_SIZE)
+ goto out;
+
+ buf = kmalloc(ht->task_comm_len, GFP_KERNEL);
+ if (!buf)
+ goto out;
+ ret = cr_read_string(ctx, buf, ht->task_comm_len);
+ if (!ret) {
+ /* if t->comm is too long, silently truncate */
+ memset(t->comm, 0, TASK_COMM_LEN);
+ memcpy(t->comm, buf, min(ht->task_comm_len, TASK_COMM_LEN));
+ }
+ kfree(buf);
+
+ /* FIXME: restore remaining relevant task_struct fields */
+out:
+ return ret;
+}
+static int restart_thread(void *arg)
+{
+ struct thr_context *thr_ctx = arg;
+ struct cr_ctx *ctx;
+ struct cr_hdr_task *ht;
+ int ret;
+ int i;
+
+ current->state = TASK_UNINTERRUPTIBLE;
+
+ ctx = thr_ctx->ctx;
+ ht = thr_ctx->ht;
+
+ if (ht->vpid == 1) {
+ ctx->root_task = current;
+ ctx->root_nsproxy = current->nsproxy;
+
+ get_task_struct(ctx->root_task);
+ get_nsproxy(ctx->root_nsproxy);
+ }
+
+ ret = cr_rstr_task_struct(ctx, ht);
+ cr_debug("rstr_task_struct: ret %d\n", ret);
+ if (ret < 0)
+ goto out;
+ ret = cr_read_mm(ctx);
+ cr_debug("memory: ret %d\n", ret);
+ if (ret < 0)
+ goto out;
+ ret = cr_read_files(ctx);
+ cr_debug("files: ret %d\n", ret);
+ if (ret < 0)
+ goto out;
+ ret = cr_read_thread(ctx);
+ cr_debug("thread: ret %d\n", ret);
+ if (ret < 0)
+ goto out;
+ ret = cr_read_cpu(ctx);
+ cr_debug("cpu: ret %d\n", ret);
+
+ for (i = 0; i < ht->children_nr; i++) {
+ ret = cr_restart_process(ctx);
+ if (ret < 0)
+ break;
+ }
+
+out:
+ thr_ctx->error = ret;
+ complete(&thr_ctx->complete);
+
+ if (!ret && (ht->state & (EXIT_ZOMBIE|EXIT_DEAD))) {
+ do_exit(ht->exit_code);
+ } else {
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ }
+ schedule();
+
+ cr_debug("leaked %d/%d %p\n", task_pid_nr(current), task_pid_vnr(current), current->mm);
+
+ complete_and_exit(NULL, 0);
+ return ret;
+}
+
+static int cr_restart_process(struct cr_ctx *ctx)
+{
+ struct thr_context thr_ctx;
+ struct task_struct *tsk;
+ struct cr_hdr_task *ht = cr_hbuf_get(ctx, sizeof(*ht));
+ int pid, parent, ret = -EINVAL;
+
+ thr_ctx.ctx = ctx;
+ thr_ctx.error = 0;
+ init_completion(&thr_ctx.complete);
+
+ parent = cr_read_obj_type(ctx, ht, sizeof(*ht), CR_HDR_TASK);
+ if (parent < 0) {
+ ret = parent;
+ goto out;
+ } else if (parent != 0)
+ goto out;
+
+ thr_ctx.ht = ht;
+
+ if (ht->vpid == 1) {
+ /* We should also create container here */
+ pid = cr_kernel_thread(restart_thread, &thr_ctx,
+ CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+ CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET, 0);
+ } else {
+ /* We should fork here a child with saved pid and
+ correct flags */
+ pid = cr_kernel_thread(restart_thread, &thr_ctx, 0, ht->vpid);
+ }
+ if (pid < 0) {
+ ret = pid;
+ goto out;
+ }
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_vpid(pid);
+ if (tsk)
+ get_task_struct(tsk);
+ read_unlock(&tasklist_lock);
+ if (tsk == NULL) {
+ ret = -ESRCH;
+ goto out;
+ }
+
+ wait_for_completion(&thr_ctx.complete);
+ wait_task_inactive(tsk, 0);
+ ret = thr_ctx.error;
+ put_task_struct(tsk);
+
+out:
+ cr_hbuf_put(ctx, sizeof(*ht));
+ return ret;
+}
+

int do_restart_in_kernel(struct cr_ctx *ctx)
{
- return -ENOSYS;
+ int ret, size, parent;
+ struct cr_hdr_tree *hh = cr_hbuf_get(ctx, sizeof(*hh));
+
+ ret = cr_read_head(ctx);
+ if (ret < 0)
+ goto out;
+
+ ret = -EINVAL;
+ parent = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_TREE);
+ if (parent < 0) {
+ ret = parent;
+ goto out;
+ } else if (parent != 0)
+ goto out;
+
+ size = sizeof(*ctx->pids_arr) * hh->tasks_nr;
+ if (size < 0)
+ goto out;
+ ctx->file->f_pos += size;
+
+ ret = cr_restart_process(ctx);
+ if (ret < 0)
+ goto out;
+
+ ret = cr_read_tail(ctx);
+
+out:
+ cr_hbuf_put(ctx, sizeof(*hh));
+ return ret;
}
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 947469a..7a189ac 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -109,10 +109,12 @@ extern int do_checkpoint(struct cr_ctx *ctx, pid_t pid);
extern int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t);
extern int cr_write_files(struct cr_ctx *ctx, struct task_struct *t);

+extern int cr_read_head(struct cr_ctx *ctx);
extern int do_restart(struct cr_ctx *ctx, pid_t pid);
extern int do_restart_in_kernel(struct cr_ctx *ctx);
extern int cr_read_mm(struct cr_ctx *ctx);
extern int cr_read_files(struct cr_ctx *ctx);
+extern int cr_read_tail(struct cr_ctx *ctx);

#define cr_debug(fmt, args...) \
pr_debug("[%d:c/r:%s] " fmt, task_pid_vnr(current), __func__, ## args)
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 5114bdd..3d11254 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -88,6 +88,8 @@ struct cr_hdr_task {
__u32 exit_code;
__u32 exit_signal;

+ __u32 vpid;
+ __u32 children_nr;
__s32 task_comm_len;
} __attribute__((aligned(8)));

--
1.5.6

2008-11-24 15:40:40

by Andrey Mirkin

[permalink] [raw]
Subject: [PATCH 1/2] Add flags for user-space and in-kernel process creation

Introduce 2 flags for user-space and in-kernel process creation during
restart procedure.
Also a stub function for in-kernel process restart is introduced.

Signed-off-by: Andrey Mirkin <[email protected]>
---
checkpoint/Makefile | 2 +-
checkpoint/restart.c | 4 +++-
checkpoint/rstr_process.c | 20 ++++++++++++++++++++
checkpoint/sys.c | 4 ++--
include/linux/checkpoint.h | 3 +++
5 files changed, 29 insertions(+), 4 deletions(-)
create mode 100644 checkpoint/rstr_process.c

diff --git a/checkpoint/Makefile b/checkpoint/Makefile
index 88bbc10..7ead290 100644
--- a/checkpoint/Makefile
+++ b/checkpoint/Makefile
@@ -3,4 +3,4 @@
#

obj-$(CONFIG_CHECKPOINT_RESTART) += sys.o checkpoint.o restart.o objhash.o \
- ckpt_mem.o rstr_mem.o ckpt_file.o rstr_file.o
+ ckpt_mem.o rstr_mem.o ckpt_file.o rstr_file.o rstr_process.o
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index 5360c40..9259622 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -461,7 +461,9 @@ int do_restart(struct cr_ctx *ctx, pid_t pid)
{
int ret;

- if (ctx)
+ if (ctx && (ctx->flags & CR_CTX_RSTR_IN_KERNEL))
+ ret = do_restart_in_kernel(ctx);
+ else if (ctx)
ret = do_restart_root(ctx, pid);
else
ret = do_restart_task(ctx, pid);
diff --git a/checkpoint/rstr_process.c b/checkpoint/rstr_process.c
new file mode 100644
index 0000000..ec9e51b
--- /dev/null
+++ b/checkpoint/rstr_process.c
@@ -0,0 +1,20 @@
+/*
+ * In-kernel process creation
+ *
+ * Copyright (C) 2008 Parallels, Inc.
+ *
+ * Author: Andrey Mirkin <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ */
+
+#include <linux/checkpoint.h>
+
+int do_restart_in_kernel(struct cr_ctx *ctx)
+{
+ return -ENOSYS;
+}
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 7745500..e4a9287 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -264,8 +264,8 @@ asmlinkage long sys_restart(int crid, int fd, unsigned long flags)
pid_t pid;
int ret;

- /* no flags for now */
- if (flags)
+ if ((flags & (CR_CTX_RSTR_IN_USERSPACE | CR_CTX_RSTR_IN_USERSPACE)) ==
+ (CR_CTX_RSTR_IN_USERSPACE | CR_CTX_RSTR_IN_USERSPACE))
return -EINVAL;

/* FIXME: for now, we use 'crid' as a pid */
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index cab5e19..947469a 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -61,6 +61,8 @@ struct cr_ctx {
/* cr_ctx: flags */
#define CR_CTX_CKPT 0x1
#define CR_CTX_RSTR 0x2
+#define CR_CTX_RSTR_IN_USERSPACE 0x4
+#define CR_CTX_RSTR_IN_KERNEL 0x8

extern int cr_kwrite(struct cr_ctx *ctx, void *buf, int count);
extern int cr_kread(struct cr_ctx *ctx, void *buf, int count);
@@ -108,6 +110,7 @@ extern int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t);
extern int cr_write_files(struct cr_ctx *ctx, struct task_struct *t);

extern int do_restart(struct cr_ctx *ctx, pid_t pid);
+extern int do_restart_in_kernel(struct cr_ctx *ctx);
extern int cr_read_mm(struct cr_ctx *ctx);
extern int cr_read_files(struct cr_ctx *ctx);

--
1.5.6

2008-11-24 16:04:40

by Louis Rilling

[permalink] [raw]
Subject: Re: [PATCH 1/2] Add flags for user-space and in-kernel process creation

On 24/11/08 18:39 +0300, Andrey Mirkin wrote:
> Introduce 2 flags for user-space and in-kernel process creation during
> restart procedure.
> Also a stub function for in-kernel process restart is introduced.
>

[...]

> diff --git a/checkpoint/sys.c b/checkpoint/sys.c
> index 7745500..e4a9287 100644
> --- a/checkpoint/sys.c
> +++ b/checkpoint/sys.c
> @@ -264,8 +264,8 @@ asmlinkage long sys_restart(int crid, int fd, unsigned long flags)
> pid_t pid;
> int ret;
>
> - /* no flags for now */
> - if (flags)
> + if ((flags & (CR_CTX_RSTR_IN_USERSPACE | CR_CTX_RSTR_IN_USERSPACE)) ==
> + (CR_CTX_RSTR_IN_USERSPACE | CR_CTX_RSTR_IN_USERSPACE))

I guess that the intent was:
+ if ((flags & (CR_CTX_RSTR_IN_USERSPACE | CR_CTX_RSTR_IN_KERNEL)) ==
+ (CR_CTX_RSTR_IN_USERSPACE | CR_CTX_RSTR_IN_KERNEL))

?

Louis

> return -EINVAL;
>
> /* FIXME: for now, we use 'crid' as a pid */
> diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
> index cab5e19..947469a 100644
> --- a/include/linux/checkpoint.h
> +++ b/include/linux/checkpoint.h
> @@ -61,6 +61,8 @@ struct cr_ctx {
> /* cr_ctx: flags */
> #define CR_CTX_CKPT 0x1
> #define CR_CTX_RSTR 0x2
> +#define CR_CTX_RSTR_IN_USERSPACE 0x4
> +#define CR_CTX_RSTR_IN_KERNEL 0x8
>
> extern int cr_kwrite(struct cr_ctx *ctx, void *buf, int count);
> extern int cr_kread(struct cr_ctx *ctx, void *buf, int count);
> @@ -108,6 +110,7 @@ extern int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t);
> extern int cr_write_files(struct cr_ctx *ctx, struct task_struct *t);
>
> extern int do_restart(struct cr_ctx *ctx, pid_t pid);
> +extern int do_restart_in_kernel(struct cr_ctx *ctx);
> extern int cr_read_mm(struct cr_ctx *ctx);
> extern int cr_read_files(struct cr_ctx *ctx);
>
> --
> 1.5.6
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes


Attachments:
(No filename) (2.17 kB)
signature.asc (189.00 B)
Digital signature
Download all attachments

2008-11-25 00:41:27

by Alexey Dobriyan

[permalink] [raw]
Subject: Re: [PATCH 2/2] Add support for in-kernel process creation during restart

On Mon, Nov 24, 2008 at 06:39:35PM +0300, Andrey Mirkin wrote:
> +static int cr_rstr_task_struct(struct cr_ctx *ctx, struct cr_hdr_task *ht)
> +{
> + struct task_struct *t = current;
> + char *buf;
> + int ret = -EINVAL;
> +
> + /* upper limit for task_comm_len to prevent DoS */
> + if (ht->task_comm_len < 0 || ht->task_comm_len > PAGE_SIZE)
> + goto out;
> +
> + buf = kmalloc(ht->task_comm_len, GFP_KERNEL);
> + if (!buf)
> + goto out;
> + ret = cr_read_string(ctx, buf, ht->task_comm_len);
> + if (!ret) {
> + /* if t->comm is too long, silently truncate */
> + memset(t->comm, 0, TASK_COMM_LEN);
> + memcpy(t->comm, buf, min(ht->task_comm_len, TASK_COMM_LEN));
> + }
> + kfree(buf);
> +
> + /* FIXME: restore remaining relevant task_struct fields */
> +out:
> + return ret;
> +}

->comm is only 16 bytes wide, you can just use on-stack variable.

2008-11-25 20:02:46

by Oren Laadan

[permalink] [raw]
Subject: Re: [PATCH 0/2] In-kernel process restart


Andrey Mirkin wrote:
> These patchset introduces OpenVZ kernel based restart procedure on top of
> Oren's checkpoint/restart patchset v9.
>
> For restarting a set of processes one will need to call sys_restart() once with
> new flag CR_CTX_RSTR_IN_KERNEL. All work will be done in kernel in this case.
>
> Small changes to image format are required to make in-kernel process creation
> more easy.
>
> Oren, please take a look on this patchset. I've tried to port OpenVZ
> functionality on top of yours with minimal changes.
>

Thanks, Andrey. The patch looks simple and good.

Did you look at my (user-space) process creation code ? specifically, it
uses the task-pids array (that is saved during checkpoint) to figure out
how many children a task has to spawn and their corresponding pids.

Is there a particular reason that you chose not to use that data during
restart ? eventually, you'll need to add the rest of it, too, to the task
data, like you did with the number of children in this patch.

Instead, since the data is available and the logic is available, we can
have each new thread call do_restart_task(), and the container init
should call do_restart_root().

What do you think ? I can go ahead and make the changes on top of your
patch.

Oren.

2008-11-25 20:18:30

by Oren Laadan

[permalink] [raw]
Subject: Re: [PATCH 2/2] Add support for in-kernel process creation during restart

Hi,

Andrey Mirkin wrote:
> All work (process tree creation and process state restore) now can be
> done in kernel.
>
> Task structure in image file is extended with 2 fields to make in-kernel
> process creation more easy.
>
> Signed-off-by: Andrey Mirkin <[email protected]>
> ---
> checkpoint/checkpoint.c | 17 ++++
> checkpoint/restart.c | 4 +-
> checkpoint/rstr_process.c | 201 +++++++++++++++++++++++++++++++++++++++-
> include/linux/checkpoint.h | 2 +
> include/linux/checkpoint_hdr.h | 2 +
> 5 files changed, 223 insertions(+), 3 deletions(-)
>
> diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
> index 04b0c4a..ae3326e 100644
> --- a/checkpoint/checkpoint.c
> +++ b/checkpoint/checkpoint.c
> @@ -173,6 +173,21 @@ static int cr_write_tail(struct cr_ctx *ctx)
> return ret;
> }
>
> +static int cr_count_children(struct cr_ctx *ctx, struct task_struct *tsk)
> +{
> + int num = 0;
> + struct task_struct *child;
> +
> + read_lock(&tasklist_lock);
> + list_for_each_entry(child, &tsk->children, sibling) {
> + if (child->parent != tsk)
> + continue;
> + num++;
> + }
> + read_unlock(&tasklist_lock);
> + return num;
> +}
> +

This information (and the pids of children) is already available in
the ctx->pids_arr.

> /* dump the task_struct of a given task */
> static int cr_write_task_struct(struct cr_ctx *ctx, struct task_struct *t)
> {
> @@ -189,6 +204,8 @@ static int cr_write_task_struct(struct cr_ctx *ctx, struct task_struct *t)
> hh->exit_code = t->exit_code;
> hh->exit_signal = t->exit_signal;
>
> + hh->vpid = task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns);
> + hh->children_nr = cr_count_children(ctx, t);
> hh->task_comm_len = TASK_COMM_LEN;

Same here (hmm... well, if it weren't skipped below, actually...)

>
> /* FIXME: save remaining relevant task_struct fields */
> diff --git a/checkpoint/restart.c b/checkpoint/restart.c

[...]

> +static int restart_thread(void *arg)
> +{
> + struct thr_context *thr_ctx = arg;
> + struct cr_ctx *ctx;
> + struct cr_hdr_task *ht;
> + int ret;
> + int i;
> +
> + current->state = TASK_UNINTERRUPTIBLE;

Why do you not want it to be interruptible ?

> +
> + ctx = thr_ctx->ctx;
> + ht = thr_ctx->ht;
> +
> + if (ht->vpid == 1) {
> + ctx->root_task = current;
> + ctx->root_nsproxy = current->nsproxy;
> +
> + get_task_struct(ctx->root_task);
> + get_nsproxy(ctx->root_nsproxy);
> + }
> +
> + ret = cr_rstr_task_struct(ctx, ht);
> + cr_debug("rstr_task_struct: ret %d\n", ret);
> + if (ret < 0)
> + goto out;
> + ret = cr_read_mm(ctx);
> + cr_debug("memory: ret %d\n", ret);
> + if (ret < 0)
> + goto out;
> + ret = cr_read_files(ctx);
> + cr_debug("files: ret %d\n", ret);
> + if (ret < 0)
> + goto out;
> + ret = cr_read_thread(ctx);
> + cr_debug("thread: ret %d\n", ret);
> + if (ret < 0)
> + goto out;
> + ret = cr_read_cpu(ctx);
> + cr_debug("cpu: ret %d\n", ret);
> +
> + for (i = 0; i < ht->children_nr; i++) {
> + ret = cr_restart_process(ctx);
> + if (ret < 0)
> + break;
> + }

Here, eventually we'll need the pids of those processes; instead of
keeping 'ht->children_nr', you could loop through the pids array in
the ctx and create those who have their parent set to us.

> +
> +out:
> + thr_ctx->error = ret;
> + complete(&thr_ctx->complete);
> +
> + if (!ret && (ht->state & (EXIT_ZOMBIE|EXIT_DEAD))) {
> + do_exit(ht->exit_code);
> + } else {
> + __set_current_state(TASK_UNINTERRUPTIBLE);
> + }
> + schedule();

Using TASK_UNINTERRUPTIBLE, may keep a task unkill-able for potentially
long time. Any reason not to use TASK_INTERRUPTIBLE ?

> +
> + cr_debug("leaked %d/%d %p\n", task_pid_nr(current), task_pid_vnr(current), current->mm);
> +
> + complete_and_exit(NULL, 0);
> + return ret;
> +}
> +
> +static int cr_restart_process(struct cr_ctx *ctx)
> +{
> + struct thr_context thr_ctx;
> + struct task_struct *tsk;
> + struct cr_hdr_task *ht = cr_hbuf_get(ctx, sizeof(*ht));
> + int pid, parent, ret = -EINVAL;
> +
> + thr_ctx.ctx = ctx;
> + thr_ctx.error = 0;
> + init_completion(&thr_ctx.complete);
> +
> + parent = cr_read_obj_type(ctx, ht, sizeof(*ht), CR_HDR_TASK);
> + if (parent < 0) {
> + ret = parent;
> + goto out;
> + } else if (parent != 0)
> + goto out;
> +
> + thr_ctx.ht = ht;
> +
> + if (ht->vpid == 1) {
> + /* We should also create container here */
> + pid = cr_kernel_thread(restart_thread, &thr_ctx,
> + CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
> + CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET, 0);

why do you want to start the container in the kernel ?

instead, we can create a new containers in user space, and have the init task
call the restart syscall from inside there.


> + } else {
> + /* We should fork here a child with saved pid and
> + correct flags */
> + pid = cr_kernel_thread(restart_thread, &thr_ctx, 0, ht->vpid);
> + }
> + if (pid < 0) {
> + ret = pid;
> + goto out;
> + }
> + read_lock(&tasklist_lock);
> + tsk = find_task_by_vpid(pid);
> + if (tsk)
> + get_task_struct(tsk);
> + read_unlock(&tasklist_lock);
> + if (tsk == NULL) {
> + ret = -ESRCH;
> + goto out;
> + }
> +
> + wait_for_completion(&thr_ctx.complete);
> + wait_task_inactive(tsk, 0);
> + ret = thr_ctx.error;
> + put_task_struct(tsk);
> +
> +out:
> + cr_hbuf_put(ctx, sizeof(*ht));
> + return ret;
> +}
> +
>
> int do_restart_in_kernel(struct cr_ctx *ctx)
> {
> - return -ENOSYS;
> + int ret, size, parent;
> + struct cr_hdr_tree *hh = cr_hbuf_get(ctx, sizeof(*hh));
> +
> + ret = cr_read_head(ctx);
> + if (ret < 0)
> + goto out;
> +
> + ret = -EINVAL;
> + parent = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_TREE);
> + if (parent < 0) {
> + ret = parent;
> + goto out;
> + } else if (parent != 0)
> + goto out;
> +
> + size = sizeof(*ctx->pids_arr) * hh->tasks_nr;
> + if (size < 0)
> + goto out;
> + ctx->file->f_pos += size;

You can't do that - the file may be non-seekable (e.g. a socket); must read
the data in.
Besides, if instead of skipping this data you read it in, then you could use
it as noted above.

> +
> + ret = cr_restart_process(ctx);
> + if (ret < 0)
> + goto out;
> +
> + ret = cr_read_tail(ctx);

[...]

Thanks,

Oren.

2008-11-26 04:56:18

by Andrey Mirkin

[permalink] [raw]
Subject: Re: [PATCH 1/2] Add flags for user-space and in-kernel process creation

On Monday 24 November 2008 19:02 Louis Rilling wrote:
> On 24/11/08 18:39 +0300, Andrey Mirkin wrote:
> > Introduce 2 flags for user-space and in-kernel process creation during
> > restart procedure.
> > Also a stub function for in-kernel process restart is introduced.
>
> [...]
>
> > diff --git a/checkpoint/sys.c b/checkpoint/sys.c
> > index 7745500..e4a9287 100644
> > --- a/checkpoint/sys.c
> > +++ b/checkpoint/sys.c
> > @@ -264,8 +264,8 @@ asmlinkage long sys_restart(int crid, int fd,
> > unsigned long flags) pid_t pid;
> > int ret;
> >
> > - /* no flags for now */
> > - if (flags)
> > + if ((flags & (CR_CTX_RSTR_IN_USERSPACE | CR_CTX_RSTR_IN_USERSPACE)) ==
> > + (CR_CTX_RSTR_IN_USERSPACE | CR_CTX_RSTR_IN_USERSPACE))
>
> I guess that the intent was:
> + if ((flags & (CR_CTX_RSTR_IN_USERSPACE | CR_CTX_RSTR_IN_KERNEL)) ==
> + (CR_CTX_RSTR_IN_USERSPACE | CR_CTX_RSTR_IN_KERNEL))
>
> ?

Oh, of course it should be like this.
I've sent wrong patch. Will resend correct one.

Andrey

2008-11-26 05:08:44

by Andrey Mirkin

[permalink] [raw]
Subject: Re: [PATCH 2/2] Add support for in-kernel process creation during restart

On Tuesday 25 November 2008 03:45 Alexey Dobriyan wrote:
> On Mon, Nov 24, 2008 at 06:39:35PM +0300, Andrey Mirkin wrote:
> > +static int cr_rstr_task_struct(struct cr_ctx *ctx, struct cr_hdr_task
> > *ht) +{
> > + struct task_struct *t = current;
> > + char *buf;
> > + int ret = -EINVAL;
> > +
> > + /* upper limit for task_comm_len to prevent DoS */
> > + if (ht->task_comm_len < 0 || ht->task_comm_len > PAGE_SIZE)
> > + goto out;
> > +
> > + buf = kmalloc(ht->task_comm_len, GFP_KERNEL);
> > + if (!buf)
> > + goto out;
> > + ret = cr_read_string(ctx, buf, ht->task_comm_len);
> > + if (!ret) {
> > + /* if t->comm is too long, silently truncate */
> > + memset(t->comm, 0, TASK_COMM_LEN);
> > + memcpy(t->comm, buf, min(ht->task_comm_len, TASK_COMM_LEN));
> > + }
> > + kfree(buf);
> > +
> > + /* FIXME: restore remaining relevant task_struct fields */
> > +out:
> > + return ret;
> > +}
>
> ->comm is only 16 bytes wide, you can just use on-stack variable.
Yes, you right here.
But who knows how it can be changed later.
Also we have almost the same function for process restore from user space.
In next version I will use it instead of introducing new one.

Andrey

2008-11-26 11:44:41

by Andrey Mirkin

[permalink] [raw]
Subject: Re: [PATCH 0/2] In-kernel process restart

On Tuesday 25 November 2008 23:02 Oren Laadan wrote:
> Andrey Mirkin wrote:
> > These patchset introduces OpenVZ kernel based restart procedure on top of
> > Oren's checkpoint/restart patchset v9.
> >
> > For restarting a set of processes one will need to call sys_restart()
> > once with new flag CR_CTX_RSTR_IN_KERNEL. All work will be done in kernel
> > in this case.
> >
> > Small changes to image format are required to make in-kernel process
> > creation more easy.
> >
> > Oren, please take a look on this patchset. I've tried to port OpenVZ
> > functionality on top of yours with minimal changes.
>
> Thanks, Andrey. The patch looks simple and good.
>
> Did you look at my (user-space) process creation code ? specifically, it
> uses the task-pids array (that is saved during checkpoint) to figure out
> how many children a task has to spawn and their corresponding pids.

Yes, I've take a look on mktree tool. The algorithm of process creation is
quite clear.

> Is there a particular reason that you chose not to use that data during
> restart ? eventually, you'll need to add the rest of it, too, to the task
> data, like you did with the number of children in this patch.

Agree, I'll rework my patchset to use already existing data.

> Instead, since the data is available and the logic is available, we can
> have each new thread call do_restart_task(), and the container init
> should call do_restart_root().
>
> What do you think ? I can go ahead and make the changes on top of your
> patch.

I'll take a closer look on these functions and maybe we can use them for
created threads too. But from our experience during in-kernel process
creation and restore it is needed to perfrorm some restore procedures when
process is inactive. That is why I think we can't reuse all existing code for
in-kernel restart. Anyway I'll check everything once again.

Andrey

2008-11-26 11:58:20

by Andrey Mirkin

[permalink] [raw]
Subject: Re: [PATCH 2/2] Add support for in-kernel process creation during restart

On Tuesday 25 November 2008 23:17 Oren Laadan wrote:
> Hi,
>
> Andrey Mirkin wrote:
> > All work (process tree creation and process state restore) now can be
> > done in kernel.
> >
> > Task structure in image file is extended with 2 fields to make in-kernel
> > process creation more easy.
> >
> > Signed-off-by: Andrey Mirkin <[email protected]>
> > ---
> > checkpoint/checkpoint.c | 17 ++++
> > checkpoint/restart.c | 4 +-
> > checkpoint/rstr_process.c | 201
> > +++++++++++++++++++++++++++++++++++++++- include/linux/checkpoint.h |
> > 2 +
> > include/linux/checkpoint_hdr.h | 2 +
> > 5 files changed, 223 insertions(+), 3 deletions(-)
> >
> > diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
> > index 04b0c4a..ae3326e 100644
> > --- a/checkpoint/checkpoint.c
> > +++ b/checkpoint/checkpoint.c
> > @@ -173,6 +173,21 @@ static int cr_write_tail(struct cr_ctx *ctx)
> > return ret;
> > }
> >
> > +static int cr_count_children(struct cr_ctx *ctx, struct task_struct
> > *tsk) +{
> > + int num = 0;
> > + struct task_struct *child;
> > +
> > + read_lock(&tasklist_lock);
> > + list_for_each_entry(child, &tsk->children, sibling) {
> > + if (child->parent != tsk)
> > + continue;
> > + num++;
> > + }
> > + read_unlock(&tasklist_lock);
> > + return num;
> > +}
> > +
>
> This information (and the pids of children) is already available in
> the ctx->pids_arr.

Yes, as I said in my previous e-mail I'll rework this patch to use this data.

> > /* dump the task_struct of a given task */
> > static int cr_write_task_struct(struct cr_ctx *ctx, struct task_struct
> > *t) {
> > @@ -189,6 +204,8 @@ static int cr_write_task_struct(struct cr_ctx *ctx,
> > struct task_struct *t) hh->exit_code = t->exit_code;
> > hh->exit_signal = t->exit_signal;
> >
> > + hh->vpid = task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns);
> > + hh->children_nr = cr_count_children(ctx, t);
> > hh->task_comm_len = TASK_COMM_LEN;
>
> Same here (hmm... well, if it weren't skipped below, actually...)
>
> > /* FIXME: save remaining relevant task_struct fields */
> > diff --git a/checkpoint/restart.c b/checkpoint/restart.c
>
> [...]
>
> > +static int restart_thread(void *arg)
> > +{
> > + struct thr_context *thr_ctx = arg;
> > + struct cr_ctx *ctx;
> > + struct cr_hdr_task *ht;
> > + int ret;
> > + int i;
> > +
> > + current->state = TASK_UNINTERRUPTIBLE;
>
> Why do you not want it to be interruptible ?

To be sure that nobody will interrupt our process and it will finish all the
work it should perform while restore.

> > +
> > + ctx = thr_ctx->ctx;
> > + ht = thr_ctx->ht;
> > +
> > + if (ht->vpid == 1) {
> > + ctx->root_task = current;
> > + ctx->root_nsproxy = current->nsproxy;
> > +
> > + get_task_struct(ctx->root_task);
> > + get_nsproxy(ctx->root_nsproxy);
> > + }
> > +
> > + ret = cr_rstr_task_struct(ctx, ht);
> > + cr_debug("rstr_task_struct: ret %d\n", ret);
> > + if (ret < 0)
> > + goto out;
> > + ret = cr_read_mm(ctx);
> > + cr_debug("memory: ret %d\n", ret);
> > + if (ret < 0)
> > + goto out;
> > + ret = cr_read_files(ctx);
> > + cr_debug("files: ret %d\n", ret);
> > + if (ret < 0)
> > + goto out;
> > + ret = cr_read_thread(ctx);
> > + cr_debug("thread: ret %d\n", ret);
> > + if (ret < 0)
> > + goto out;
> > + ret = cr_read_cpu(ctx);
> > + cr_debug("cpu: ret %d\n", ret);
> > +
> > + for (i = 0; i < ht->children_nr; i++) {
> > + ret = cr_restart_process(ctx);
> > + if (ret < 0)
> > + break;
> > + }
>
> Here, eventually we'll need the pids of those processes; instead of
> keeping 'ht->children_nr', you could loop through the pids array in
> the ctx and create those who have their parent set to us.

Well, it will require a little bit more time, but in this case I can reuse
data about process tree from dump file.
Ok, I'll rework my patch.

> > +
> > +out:
> > + thr_ctx->error = ret;
> > + complete(&thr_ctx->complete);
> > +
> > + if (!ret && (ht->state & (EXIT_ZOMBIE|EXIT_DEAD))) {
> > + do_exit(ht->exit_code);
> > + } else {
> > + __set_current_state(TASK_UNINTERRUPTIBLE);
> > + }
> > + schedule();
>
> Using TASK_UNINTERRUPTIBLE, may keep a task unkill-able for potentially
> long time. Any reason not to use TASK_INTERRUPTIBLE ?

In OpenVZ we are moving tasks to uninterruptible state to be sure that they
won't be killed. If something went wrong we move them to interruptible state
and kill on error path. If undump is finished successfully then we move them
to interruptible state during resume stage.

> > +
> > + cr_debug("leaked %d/%d %p\n", task_pid_nr(current),
> > task_pid_vnr(current), current->mm); +
> > + complete_and_exit(NULL, 0);
> > + return ret;
> > +}
> > +
> > +static int cr_restart_process(struct cr_ctx *ctx)
> > +{
> > + struct thr_context thr_ctx;
> > + struct task_struct *tsk;
> > + struct cr_hdr_task *ht = cr_hbuf_get(ctx, sizeof(*ht));
> > + int pid, parent, ret = -EINVAL;
> > +
> > + thr_ctx.ctx = ctx;
> > + thr_ctx.error = 0;
> > + init_completion(&thr_ctx.complete);
> > +
> > + parent = cr_read_obj_type(ctx, ht, sizeof(*ht), CR_HDR_TASK);
> > + if (parent < 0) {
> > + ret = parent;
> > + goto out;
> > + } else if (parent != 0)
> > + goto out;
> > +
> > + thr_ctx.ht = ht;
> > +
> > + if (ht->vpid == 1) {
> > + /* We should also create container here */
> > + pid = cr_kernel_thread(restart_thread, &thr_ctx,
> > + CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
> > + CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET, 0);
>
> why do you want to start the container in the kernel ?
>
> instead, we can create a new containers in user space, and have the init
> task call the restart syscall from inside there.

The idea is simple - do _all_ the work in the kernel. That is why creation of
container also done in the kernel.

> > + } else {
> > + /* We should fork here a child with saved pid and
> > + correct flags */
> > + pid = cr_kernel_thread(restart_thread, &thr_ctx, 0, ht->vpid);
> > + }
> > + if (pid < 0) {
> > + ret = pid;
> > + goto out;
> > + }
> > + read_lock(&tasklist_lock);
> > + tsk = find_task_by_vpid(pid);
> > + if (tsk)
> > + get_task_struct(tsk);
> > + read_unlock(&tasklist_lock);
> > + if (tsk == NULL) {
> > + ret = -ESRCH;
> > + goto out;
> > + }
> > +
> > + wait_for_completion(&thr_ctx.complete);
> > + wait_task_inactive(tsk, 0);
> > + ret = thr_ctx.error;
> > + put_task_struct(tsk);
> > +
> > +out:
> > + cr_hbuf_put(ctx, sizeof(*ht));
> > + return ret;
> > +}
> > +
> >
> > int do_restart_in_kernel(struct cr_ctx *ctx)
> > {
> > - return -ENOSYS;
> > + int ret, size, parent;
> > + struct cr_hdr_tree *hh = cr_hbuf_get(ctx, sizeof(*hh));
> > +
> > + ret = cr_read_head(ctx);
> > + if (ret < 0)
> > + goto out;
> > +
> > + ret = -EINVAL;
> > + parent = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_TREE);
> > + if (parent < 0) {
> > + ret = parent;
> > + goto out;
> > + } else if (parent != 0)
> > + goto out;
> > +
> > + size = sizeof(*ctx->pids_arr) * hh->tasks_nr;
> > + if (size < 0)
> > + goto out;
> > + ctx->file->f_pos += size;
>
> You can't do that - the file may be non-seekable (e.g. a socket); must read
> the data in.
> Besides, if instead of skipping this data you read it in, then you could
> use it as noted above.

Agree, I'll rework my patch that way.

Thanks,
Andrey