Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752829AbYH2DfV (ORCPT ); Thu, 28 Aug 2008 23:35:21 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1751382AbYH2DfG (ORCPT ); Thu, 28 Aug 2008 23:35:06 -0400 Received: from e4.ny.us.ibm.com ([32.97.182.144]:50556 "EHLO e4.ny.us.ibm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751276AbYH2DfC (ORCPT ); Thu, 28 Aug 2008 23:35:02 -0400 Subject: Re: [RFC v2][PATCH 1/9] checkpoint-restart: general infrastructure From: Matt Helsley To: Dave Hansen Cc: arnd@arndb.de, jeremy@goop.org, containers@lists.linux-foundation.org, linux-kernel@vger.kernel.org In-Reply-To: <20080820192558.98A5056E@nimitz> References: <20080820192557.98788FAB@nimitz> <20080820192558.98A5056E@nimitz> Content-Type: text/plain Organization: IBM Linux Technology Center Date: Thu, 28 Aug 2008 20:34:58 -0700 Message-Id: <1219980899.11632.35.camel@localhost.localdomain> Mime-Version: 1.0 X-Mailer: Evolution 2.22.3.1 Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 24283 Lines: 893 On Wed, 2008-08-20 at 12:25 -0700, Dave Hansen wrote: > This patch adds those interfaces, as well as all of the helpers > needed to easily manage the file format. > > The code is roughly broken out as follows: > > ckpt/sys.c - user/kernel data transfer, as well as setting up of the > checkpoint/restart context (a per-checkpoint data > structure for housekeeping) > ckpt/checkpoint.c - output wrappers and basic checkpoint handling > ckpt/restart.c - input wrappers and basic restart handling > > Patches to add the per-architecture support as well as the actual > work to do the memory checkpoint follow in subsequent patches. 
> > changes from last version: > - Moved over to pr_debug() from CR_PRINTK() > - Moved magic number over to linux/magic.h > > TODO: > - Investigate using anon_inodes for the sys_checkpoint() side > - Move all the structure declarations to somewhere that we > can easily export them to userspace. > - Lots of ABI issues to work out. > > Signed-off-by: Oren Laadan > --- > > oren-cr.git-dave/Makefile | 2 > oren-cr.git-dave/checkpoint/Makefile | 1 > oren-cr.git-dave/checkpoint/checkpoint.c | 208 +++++++++++++++++++++++++++ > oren-cr.git-dave/checkpoint/ckpt.h | 71 +++++++++ > oren-cr.git-dave/checkpoint/ckpt_hdr.h | 69 +++++++++ > oren-cr.git-dave/checkpoint/restart.c | 190 +++++++++++++++++++++++++ > oren-cr.git-dave/checkpoint/sys.c | 233 +++++++++++++++++++++++++++++++ > oren-cr.git-dave/include/linux/magic.h | 2 > 8 files changed, 775 insertions(+), 1 deletion(-) > > diff -puN /dev/null checkpoint/checkpoint.c > --- /dev/null 2008-04-22 10:49:52.000000000 -0700 > +++ oren-cr.git-dave/checkpoint/checkpoint.c 2008-08-20 12:12:48.000000000 -0700 > @@ -0,0 +1,208 @@ > +/* > + * Checkpoint logic and helpers > + * > + * Copyright (C) 2008 Oren Laadan > + * > + * This file is subject to the terms and conditions of the GNU General Public > + * License. See the file COPYING in the main directory of the Linux > + * distribution for more details. 
> + */ > + > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +#include "ckpt.h" > +#include "ckpt_hdr.h" > + > +/** > + * cr_get_fname - return pathname of a given file > + * @file: file pointer > + * @buf: buffer for pathname > + * @n: buffer length (in) and pathname length (out) > + * > + * if the buffer provided by the caller is too small, allocate a new > + * buffer; caller should call cr_put_pathname() for cleanup > + */ > +char *cr_get_fname(struct path *path, struct path *root, char *buf, int *n) > +{ > + char *fname; > + > + fname = __d_path(path, root, buf, *n); > + > + if (IS_ERR(fname) && PTR_ERR(fname) == -ENAMETOOLONG) { > + if (!(buf = (char *) __get_free_pages(GFP_KERNEL, 0))) > + return ERR_PTR(-ENOMEM); > + fname = __d_path(path, root, buf, PAGE_SIZE); > + if (IS_ERR(fname)) > + free_pages((unsigned long) buf, 0); > + } > + if (!IS_ERR(fname)) > + *n = (buf + *n - fname); > + > + return fname; > +} > + > +/** > + * cr_put_fname - (possibly) cleanup pathname buffer > + * @buf: original buffer that was given to cr_get_pathname() > + * @fname: resulting pathname from cr_get_pathname() > + * @n: length of original buffer > + */ > +void cr_put_fname(char *buf, char *fname, int n) > +{ > + if (fname && (fname < buf || fname >= buf + n)) > + free_pages((unsigned long) buf, 0); > +} > + > +/** > + * cr_write_obj - write a record described by a cr_hdr > + * @ctx: checkpoint context > + * @h: record descriptor > + * @buf: record buffer > + */ > +int cr_write_obj(struct cr_ctx *ctx, struct cr_hdr *h, void *buf) > +{ > + int ret; > + > + if ((ret = cr_kwrite(ctx, h, sizeof(*h))) < 0) > + return ret; > + return cr_kwrite(ctx, buf, h->len); > +} > + > +/** > + * cr_write_str - write a string record > + * @ctx: checkpoint context > + * @str: string buffer > + * @n: string length > + */ > +int cr_write_str(struct cr_ctx *ctx, char *str, int n) > +{ > + struct cr_hdr h; > + > + h.type = CR_HDR_STR; > 
+ h.len = n; > + h.id = 0; > + > + return cr_write_obj(ctx, &h, str); > +} > + > +/* write the checkpoint header */ > +static int cr_write_hdr(struct cr_ctx *ctx) > +{ > + struct cr_hdr h; > + struct cr_hdr_head *hh = ctx->tbuf; > + struct timeval ktv; > + > + h.type = CR_HDR_HEAD; > + h.len = sizeof(hh); > + h.id = 0; > + > + do_gettimeofday(&ktv); > + > + hh->magic = CR_HEADER_MAGIC; > + hh->major = (LINUX_VERSION_CODE >> 16) & 0xff; > + hh->minor = (LINUX_VERSION_CODE >> 8) & 0xff; > + hh->patch = (LINUX_VERSION_CODE) & 0xff; > + > + hh->version = 1; > + > + hh->flags = ctx->flags; > + hh->time = ktv.tv_sec; > + > + return cr_write_obj(ctx, &h, hh); > +} > + > +/* write the checkpoint trailer */ > +static int cr_write_tail(struct cr_ctx *ctx) > +{ > + struct cr_hdr h; > + struct cr_hdr_tail *hh = ctx->tbuf; > + > + h.type = CR_HDR_TAIL; > + h.len = sizeof(*hh); > + h.id = 0; > + > + hh->magic = CR_HEADER_MAGIC; > + hh->cksum[0] = hh->cksum[1] = 1; /* TBD ... */ > + > + return cr_write_obj(ctx, &h, hh); > +} > + > +/* dump the task_struct of a given task */ > +static int cr_write_task_struct(struct cr_ctx *ctx, struct task_struct *t) > +{ > + struct cr_hdr h; > + struct cr_hdr_task *hh = ctx->tbuf; > + > + h.type = CR_HDR_TASK; > + h.len = sizeof(*hh); > + h.id = ctx->pid; > + > + hh->state = t->state; > + hh->exit_state = t->exit_state; > + hh->exit_code = t->exit_code; > + hh->exit_signal = t->exit_signal; > + > + hh->pid = t->pid; > + hh->tgid = t->tgid; > + > + hh->utime = t->utime; > + hh->stime = t->stime; > + hh->utimescaled = t->utimescaled; > + hh->stimescaled = t->stimescaled; > + hh->gtime = t->gtime; > + hh->prev_utime = t->prev_utime; > + hh->prev_stime = t->prev_stime; > + hh->nvcsw = t->nvcsw; > + hh->nivcsw = t->nivcsw; > + hh->start_time_sec = t->start_time.tv_sec; > + hh->start_time_nsec = t->start_time.tv_nsec; > + hh->real_start_time_sec = t->real_start_time.tv_sec; > + hh->real_start_time_nsec = t->real_start_time.tv_nsec; > + hh->min_flt = 
t->min_flt; > + hh->maj_flt = t->maj_flt; > + > + hh->task_comm_len = TASK_COMM_LEN; > + memcpy(hh->comm, t->comm, TASK_COMM_LEN); > + > + return cr_write_obj(ctx, &h, hh); > +} > + > +/* dump the entire state of a given task */ > +static int cr_write_task(struct cr_ctx *ctx, struct task_struct *t) > +{ > + int ret ; > + > + BUG_ON(t->state == TASK_DEAD); > + > + ret = cr_write_task_struct(ctx, t); > + pr_debug("ret (task_struct) %d\n", ret); > + > + return ret; > +} > + > +int do_checkpoint(struct cr_ctx *ctx) > +{ > + int ret; > + > + /* FIX: need to test whether container is checkpointable */ > + > + ret = cr_write_hdr(ctx); > + if (!ret) > + ret = cr_write_task(ctx, current); > + if (!ret) > + ret = cr_write_tail(ctx); > + > + /* on success, return (unique) checkpoint identifier */ > + if (!ret) > + ret = ctx->crid; > + > + return ret; How about conforming to existing kernel style by inverting the ret tests and using goto here: ret = cr_write_hdr(ctx); if (ret) goto out; ret = cr_write_task(ctx, current); if (ret) goto out; ret = cr_write_tail(ctx); if (ret) goto out; ret = ctx->crid; out: return ret; That means that if we aren't always assigning to ret we won't always need to (re)test it. Granted the compiler output is probably the same, but I think this is much more readable by reviewers. It may also be easier to maintain since you won't have to test ret before adding new code here. > +} > diff -puN /dev/null checkpoint/ckpt.h > --- /dev/null 2008-04-22 10:49:52.000000000 -0700 > +++ oren-cr.git-dave/checkpoint/ckpt.h 2008-08-20 12:12:48.000000000 -0700 > @@ -0,0 +1,71 @@ > +#ifndef _CKPT_CKPT_H_ > +#define _CKPT_CKPT_H_ > +/* > + * Generic container checkpoint-restart > + * > + * Copyright (C) 2008 Oren Laadan > + * > + * This file is subject to the terms and conditions of the GNU General Public > + * License. See the file COPYING in the main directory of the Linux > + * distribution for more details. 
> + */ > + > +#include > +#include > + > +struct cr_pgarr; > + > +struct cr_ctx { > + pid_t pid; /* container identifier */ > + int crid; /* unique checkpoint id */ > + > + unsigned long flags; > + unsigned long oflags; /* restart: old flags */ > + > + struct file *file; > + int total; /* total read/written */ > + > + void *tbuf; /* temp: to avoid many alloc/dealloc */ > + void *hbuf; /* header: to avoid many alloc/dealloc */ > + int hpos; > + > + struct cr_pgarr *pgarr; > + struct cr_pgarr *pgcur; > + > + struct path *vfsroot; /* container root */ > +}; > + > +/* cr_ctx: flags */ > +#define CR_CTX_CKPT 0x1 > +#define CR_CTX_RSTR 0x2 > + > +/* allocation defaults */ > +#define CR_ORDER_TBUF 1 > +#define CR_ORDER_HBUF 1 > + > +#define CR_TBUF_TOTAL ((PAGE_SIZE << CR_ORDER_TBUF) / sizeof(void *)) > +#define CR_HBUF_TOTAL ((PAGE_SIZE << CR_ORDER_HBUF) / sizeof(void *)) > + > +extern void cr_put_fname(char *buf, char *fname, int n); > +extern char *cr_get_fname(struct path *path, struct path *root, char *buf, int *n); > + > +extern int cr_uwrite(struct cr_ctx *ctx, void *buf, int count); > +extern int cr_kwrite(struct cr_ctx *ctx, void *buf, int count); > +extern int cr_uread(struct cr_ctx *ctx, void *buf, int count); > +extern int cr_kread(struct cr_ctx *ctx, void *buf, int count); > + > +struct cr_hdr; > + > +extern int cr_write_obj(struct cr_ctx *ctx, struct cr_hdr *h, void *buf); > +extern int cr_write_str(struct cr_ctx *ctx, char *str, int n); > +extern int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t); > + > +extern int cr_read_obj(struct cr_ctx *ctx, struct cr_hdr *h, void *buf, int n); > +extern int cr_read_obj_type(struct cr_ctx *ctx, void *buf, int n, int type); > +extern int cr_read_str(struct cr_ctx *ctx, void *str, int n); > +extern int cr_read_mm(struct cr_ctx *ctx); > + > +extern int do_checkpoint(struct cr_ctx *ctx); > +extern int do_restart(struct cr_ctx *ctx); > + > +#endif /* _CKPT_CKPT_H_ */ > diff -puN /dev/null checkpoint/ckpt_hdr.h > --- 
/dev/null 2008-04-22 10:49:52.000000000 -0700 > +++ oren-cr.git-dave/checkpoint/ckpt_hdr.h 2008-08-20 12:12:48.000000000 -0700 > @@ -0,0 +1,69 @@ > +/* > + * Generic container checkpoint-restart > + * > + * Copyright (C) 2008 Oren Laadan > + * > + * This file is subject to the terms and conditions of the GNU General Public > + * License. See the file COPYING in the main directory of the Linux > + * distribution for more details. > + */ > + > +#include > + > +struct cr_hdr { > + __s16 type; > + __s16 len; > + __u32 id; > +}; > + > +enum { > + CR_HDR_HEAD = 1, > + CR_HDR_STR, > + > + CR_HDR_TASK = 101, > + CR_HDR_THREAD, > + CR_HDR_CPU, > + > + CR_HDR_MM = 201, > + CR_HDR_VMA, > + CR_HDR_MM_CONTEXT, > + > + CR_HDR_TAIL = 5001 > +}; > + > +struct cr_hdr_head { > + __u32 magic; > + __u16 major; > + __u16 minor; > + __u16 patch; > + __u16 version; > + __u32 flags; /* checkpoint options */ > + __u64 time; /* when checkpoint taken */ > +}; > + > +struct cr_hdr_tail { > + __u32 magic; > + __u32 cksum[2]; > +}; > + > +struct cr_hdr_task { > + __u64 state; > + __u32 exit_state; > + __u32 exit_code, exit_signal; > + > + __u16 pid; > + __u16 tgid; > + > + __u64 utime, stime, utimescaled, stimescaled; > + __u64 gtime; > + __u64 prev_utime, prev_stime; > + __u64 nvcsw, nivcsw; > + __u64 start_time_sec, start_time_nsec; > + __u64 real_start_time_sec, real_start_time_nsec; > + __u64 min_flt, maj_flt; > + > + __s16 task_comm_len; > + char comm[TASK_COMM_LEN]; > +}; > + > + > diff -puN /dev/null checkpoint/Makefile > --- /dev/null 2008-04-22 10:49:52.000000000 -0700 > +++ oren-cr.git-dave/checkpoint/Makefile 2008-08-20 12:12:48.000000000 -0700 > @@ -0,0 +1 @@ > +obj-y += sys.o checkpoint.o restart.o > diff -puN /dev/null checkpoint/restart.c > --- /dev/null 2008-04-22 10:49:52.000000000 -0700 > +++ oren-cr.git-dave/checkpoint/restart.c 2008-08-20 12:12:48.000000000 -0700 > @@ -0,0 +1,190 @@ > +/* > + * Restart logic and helpers > + * > + * Copyright (C) 2008 Oren Laadan > + * > + * 
This file is subject to the terms and conditions of the GNU General Public > + * License. See the file COPYING in the main directory of the Linux > + * distribution for more details. > + */ > + > +/* > + * During restart the code reads in data from the checkpoint image into a > + * temporary buffer (ctx->hbuf). Because operations can be nested, one > + * should call cr_hbuf_get() to reserve space in the buffer, and then > + * cr_hbuf_put() when it no longer needs that space > + */ > + > +#include > +#include > +#include > +#include > + > +#include "ckpt.h" > +#include "ckpt_hdr.h" > + > +/** > + * cr_hbuf_get - reserve space on the hbuf > + * @ctx: checkpoint context > + * @n: number of bytes to reserve > + */ > +void *cr_hbuf_get(struct cr_ctx *ctx, int n) > +{ > + void *ptr; > + > + BUG_ON(ctx->hpos + n > CR_HBUF_TOTAL); > + ptr = (void *) (((char *) ctx->hbuf) + ctx->hpos); > + ctx->hpos += n; > + return ptr; > +} > + > +/** > + * cr_hbuf_put - unreserve space on the hbuf > + * @ctx: checkpoint context > + * @n: number of bytes to reserve > + */ > +void cr_hbuf_put(struct cr_ctx *ctx, int n) > +{ > + BUG_ON(ctx->hpos < n); > + ctx->hpos -= n; > +} > + > +/** > + * cr_read_obj - read a whole record (cr_hdr followed by payload) > + * @ctx: checkpoint context > + * @h: record descriptor > + * @buf: record buffer > + * @n: available buffer size > + */ > +int cr_read_obj(struct cr_ctx *ctx, struct cr_hdr *h, void *buf, int n) > +{ > + int ret; > + > + ret = cr_kread(ctx, h, sizeof(*h)); > + if (ret < 0) > + return ret; > + > + pr_debug("type %d len %d id %d (%d)\n", h->type, h->len, h->id, n); > + if (h->len < 0 || h->len > n) > + return -EINVAL; > + > + return cr_kread(ctx, buf, h->len); > +} > + > +/** > + * cr_read_obj_type - read a whole record of expected type > + * @ctx: checkpoint context > + * @buf: record buffer > + * @n: available buffer size > + * @type: expected record type > + */ > +int cr_read_obj_type(struct cr_ctx *ctx, void *buf, int n, int type) > 
+{ > + struct cr_hdr h; > + int ret; > + > + ret = cr_read_obj(ctx, &h, buf, n); > + if (!ret) > + ret = (h.type == type ? h.id : -EINVAL); > + return ret; > +} > + > +/** > + * cr_read_str - read a string record > + * @ctx: checkpoint context > + * @str: string buffer > + * @n: string length > + */ > +int cr_read_str(struct cr_ctx *ctx, void *str, int n) > +{ > + return cr_read_obj_type(ctx, str, n, CR_HDR_STR); > +} > + > +/* read the checkpoint header */ > +static int cr_read_hdr(struct cr_ctx *ctx) > +{ > + struct cr_hdr_head *hh = cr_hbuf_get(ctx, sizeof(*hh)); > + int ret; > + > + ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_HEAD); > + if (ret < 0) > + return ret; > + > + if (hh->magic != CR_HEADER_MAGIC || hh->version != 1 || > + hh->major != ((LINUX_VERSION_CODE >> 16) & 0xff) || > + hh->minor != ((LINUX_VERSION_CODE >> 8) & 0xff) || > + hh->patch != ((LINUX_VERSION_CODE) & 0xff)) > + return -EINVAL; > + > + if (hh->flags & ~CR_CTX_CKPT) > + return -EINVAL; > + > + ctx->oflags = hh->flags; > + > + cr_hbuf_put(ctx, sizeof(*hh)); > + return 0; > +} > + > +/* read the checkpoint trailer */ > +static int cr_read_tail(struct cr_ctx *ctx) > +{ > + struct cr_hdr_tail *hh = cr_hbuf_get(ctx, sizeof(*hh)); > + int ret; > + > + ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_TAIL); > + if (ret < 0) > + return ret; > + > + if (hh->magic != CR_HEADER_MAGIC || > + hh->cksum[0] != 1 || hh->cksum[1] != 1) > + return -EINVAL; > + > + cr_hbuf_put(ctx, sizeof(*hh)); > + return 0; > +} > + > +/* read the task_struct into the current task */ > +static int cr_read_task_struct(struct cr_ctx *ctx) > +{ > + struct cr_hdr_task *hh = cr_hbuf_get(ctx, sizeof(*hh)); > + struct task_struct *t = current; > + int ret; > + > + ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_TASK); > + if (ret < 0) > + return ret; > + > + /* for now, only restore t->comm */ > + if (hh->task_comm_len < 0 || hh->task_comm_len > TASK_COMM_LEN) > + return -EINVAL; > + > + memset(t->comm, 0, 
TASK_COMM_LEN); > + memcpy(t->comm, hh->comm, hh->task_comm_len); > + > + cr_hbuf_put(ctx, sizeof(*hh)); > + return 0; > +} > + > +/* read the entire state of the current task */ > +static int cr_read_task(struct cr_ctx *ctx) > +{ > + int ret; > + > + ret = cr_read_task_struct(ctx); > + pr_debug("ret (task_struct) %d\n", ret); > + > + return ret; > +} > + > +int do_restart(struct cr_ctx *ctx) > +{ > + int ret; > + > + ret = cr_read_hdr(ctx); > + if (!ret) > + ret = cr_read_task(ctx); > + if (!ret) > + ret = cr_read_tail(ctx); same comment > + > + return ret; > +} > diff -puN /dev/null checkpoint/sys.c > --- /dev/null 2008-04-22 10:49:52.000000000 -0700 > +++ oren-cr.git-dave/checkpoint/sys.c 2008-08-20 12:12:48.000000000 -0700 > @@ -0,0 +1,233 @@ > +/* > + * Generic container checkpoint-restart > + * > + * Copyright (C) 2008 Oren Laadan > + * > + * This file is subject to the terms and conditions of the GNU General Public > + * License. See the file COPYING in the main directory of the Linux > + * distribution for more details. > + */ > + > +#include > +#include > +#include > +#include > +#include > + > +#include "ckpt.h" > + > +/* > + * helpers to write/read to/from the image file descriptor > + * > + * cr_uwrite() - write a user-space buffer to the checkpoint image > + * cr_kwrite() - write a kernel-space buffer to the checkpoint image > + * cr_uread() - read from the checkpoint image to a user-space buffer > + * cr_kread() - read from the checkpoint image to a kernel-space buffer > + * > + */ > + > +/* (temporarily added file_pos_read() and file_pos_write() because they > + * are static in fs/read_write.c... 
should cleanup and remove later) */ > +static inline loff_t file_pos_read(struct file *file) > +{ > + return file->f_pos; > +} > + > +static inline void file_pos_write(struct file *file, loff_t pos) > +{ > + file->f_pos = pos; > +} > + > +int cr_uwrite(struct cr_ctx *ctx, void *buf, int count) > +{ > + struct file *file = ctx->file; > + ssize_t nwrite; > + int nleft; > + > + for (nleft = count; nleft; nleft -= nwrite) { > + loff_t pos = file_pos_read(file); > + nwrite = vfs_write(file, (char __user *) buf, nleft, &pos); > + file_pos_write(file, pos); > + if (unlikely(nwrite <= 0)) /* zero tolerance */ > + return (nwrite ? : -EIO); > + buf += nwrite; > + } > + > + ctx->total += count; > + return 0; > +} > + > +int cr_kwrite(struct cr_ctx *ctx, void *buf, int count) > +{ > + mm_segment_t oldfs; > + int ret; > + > + oldfs = get_fs(); > + set_fs(KERNEL_DS); > + ret = cr_uwrite(ctx, buf, count); > + set_fs(oldfs); > + > + return ret; > +} > + > +int cr_uread(struct cr_ctx *ctx, void *buf, int count) > +{ > + struct file *file = ctx->file; > + ssize_t nread; > + int nleft; > + > + for (nleft = count; nleft; nleft -= nread) { > + loff_t pos = file_pos_read(file); > + nread = vfs_read(file, (char __user *) buf, nleft, &pos); > + file_pos_write(file, pos); > + if (unlikely(nread <= 0)) /* zero tolerance */ > + return (nread ? : -EIO); > + buf += nread; > + } > + > + ctx->total += count; > + return 0; > +} > + > +int cr_kread(struct cr_ctx *ctx, void *buf, int count) > +{ > + mm_segment_t oldfs; > + int ret; > + > + oldfs = get_fs(); > + set_fs(KERNEL_DS); > + ret = cr_uread(ctx, buf, count); > + set_fs(oldfs); > + > + return ret; > +} > + > + > +/* > + * helpers to manage CR contexts: allocated for each checkpoint and/or > + * restart operation, and persists until the operation is completed. 
> + */ > + > +static atomic_t cr_ctx_count; /* unique checkpoint identifier */ > + > +void cr_ctx_free(struct cr_ctx *ctx) > +{ > + > + if (ctx->file) > + fput(ctx->file); > + if (ctx->vfsroot) > + path_put(ctx->vfsroot); > + > + free_pages((unsigned long) ctx->tbuf, CR_ORDER_TBUF); > + free_pages((unsigned long) ctx->hbuf, CR_ORDER_HBUF); > + > + kfree(ctx); > +} > + > +struct cr_ctx *cr_ctx_alloc(pid_t pid, struct file *file, unsigned long flags) > +{ > + struct cr_ctx *ctx; > + > + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); > + if (!ctx) > + return NULL; > + > + ctx->tbuf = (void *) __get_free_pages(GFP_KERNEL, CR_ORDER_TBUF); > + ctx->hbuf = (void *) __get_free_pages(GFP_KERNEL, CR_ORDER_HBUF); > + if (!ctx->tbuf || !ctx->hbuf) > + goto nomem; > + > + ctx->pid = pid; > + ctx->flags = flags; > + > + ctx->file = file; > + get_file(file); > + > + /* assume checkpointer is in container's root vfs */ > + ctx->vfsroot = &current->fs->root; > + path_get(ctx->vfsroot); > + > + ctx->crid = atomic_inc_return(&cr_ctx_count); > + > + return ctx; > + > + nomem: > + cr_ctx_free(ctx); > + return NULL; > +} > + > +/** > + * sys_checkpoint - checkpoint a container > + * @pid: pid of the container init(1) process > + * @fd: file to which dump the checkpoint image > + * @flags: checkpoint operation flags > + */ > +asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned long flags) > +{ > + struct cr_ctx *ctx; > + struct file *file; > + int fput_needed; > + int ret; > + > + if (!capable(CAP_SYS_ADMIN)) > + return -EPERM; > + > + file = fget_light(fd, &fput_needed); > + if (!file) > + return -EBADF; > + > + /* no flags for now */ > + if (flags) > + return -EINVAL; > + > + ctx = cr_ctx_alloc(pid, file, flags | CR_CTX_CKPT); > + if (!ctx) { > + fput_light(file, fput_needed); > + return -ENOMEM; > + } > + > + ret = do_checkpoint(ctx); > + > + cr_ctx_free(ctx); > + fput_light(file, fput_needed); > + pr_debug("ckpt retval = %d\n", ret); > + return ret; > +} > + > +/** > + * sys_restart - 
restart a container > + * @crid: checkpoint image identifier > + * @fd: file from which read the checkpoint image > + * @flags: restart operation flags > + */ > +asmlinkage long sys_restart(int crid, int fd, unsigned long flags) > +{ > + struct cr_ctx *ctx; > + struct file *file; > + int fput_needed; > + int ret; > + > + if (!capable(CAP_SYS_ADMIN)) > + return -EPERM; > + > + file = fget_light(fd, &fput_needed); > + if (!file) > + return -EBADF; > + > + /* no flags for now */ > + if (flags) > + return -EINVAL; > + > + ctx = cr_ctx_alloc(crid, file, flags | CR_CTX_RSTR); > + if (!ctx) { > + fput_light(file, fput_needed); > + return -ENOMEM; > + } > + > + ret = do_restart(ctx); > + > + cr_ctx_free(ctx); > + fput_light(file, fput_needed); > + pr_debug("restart retval = %d\n", ret); > + return ret; > +} > diff -puN include/linux/magic.h~0001-checkpoint-restart-general-infrastructure include/linux/magic.h > --- oren-cr.git/include/linux/magic.h~0001-checkpoint-restart-general-infrastructure 2008-08-20 12:12:48.000000000 -0700 > +++ oren-cr.git-dave/include/linux/magic.h 2008-08-20 12:12:48.000000000 -0700 > @@ -42,4 +42,6 @@ > #define FUTEXFS_SUPER_MAGIC 0xBAD1DEA > #define INOTIFYFS_SUPER_MAGIC 0x2BAD1DEA > > +#define CR_HEADER_MAGIC 0x002d2a00 > + > #endif /* __LINUX_MAGIC_H__ */ > diff -puN Makefile~0001-checkpoint-restart-general-infrastructure Makefile > --- oren-cr.git/Makefile~0001-checkpoint-restart-general-infrastructure 2008-08-20 12:12:48.000000000 -0700 > +++ oren-cr.git-dave/Makefile 2008-08-20 12:12:48.000000000 -0700 > @@ -619,7 +619,7 @@ export mod_strip_cmd > > > ifeq ($(KBUILD_EXTMOD),) > -core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ > +core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ checkpoint/ > > vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \ > $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ > _ > _______________________________________________ > Containers mailing list > Containers@lists.linux-foundation.org > 
https://lists.linux-foundation.org/mailman/listinfo/containers -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/