2008-10-17 23:12:23

by Andrey Mirkin

Subject: [PATCH 0/10] OpenVZ kernel based checkpointing/restart (v2)

This patchset introduces kernel-based checkpointing/restart as it is
implemented in the OpenVZ project. This version (v2) supports multiple
processes with simple private memory and open files (regular files).

Todo:
- Create processes with the same PID during restart
- Add support for x86-64
- Add support for shared objects

Changelog:

18 Oct 2008 (v2):
- Add support for multiple processes
- Cleanup and bug fixes

--

This patchset introduces kernel-based checkpointing/restart as it is
implemented in the OpenVZ project. It has limited functionality and is able
to checkpoint/restart only a single process. Recently Oren Laadan sent
another kernel-based implementation of checkpoint/restart. The main
differences between this patchset and Oren's patchset are:

* In this patchset checkpointing is initiated from outside the process
(right now we do not have a container, only namespaces), while Oren's
patchset performs checkpointing from the process context.

* Restart in this patchset is initiated from a process which starts a new
process (in new namespaces) with the saved state. Oren's patchset reuses the
process from which restart was initiated and restores the saved state over it.

* Checkpoint/restart functionality in this patchset is implemented as a
kernel module.


As checkpointing is initiated from outside the process whose state should be
saved, the process has to be frozen before its state is saved. Right now the
container freezer from Matt Helsley can be used for this.
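
For illustration only, a minimal userspace sketch of freezing a container's
tasks through the cgroup freezer before issuing a checkpoint request; the
cgroup mount point and group name ("/cgroups/ct1") are assumptions, not part
of this patchset:

/* Sketch: freeze every task in a freezer cgroup before checkpointing. */
#include <errno.h>
#include <stdio.h>

static int freeze_group(const char *state_file)
{
        FILE *f = fopen(state_file, "w");

        if (!f)
                return -errno;
        fputs("FROZEN\n", f);           /* ask the freezer to stop all tasks */
        fclose(f);
        return 0;
}

int main(void)
{
        /* hypothetical path; depends on where the freezer cgroup is mounted */
        return freeze_group("/cgroups/ct1/freezer.state") ? 1 : 0;
}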

This patchset only introduces a concept of how kernel-based
checkpointing/restart can be implemented and is able to checkpoint/restart
only a single process with simple VMAs.

I've tried to split my patchset into small patches to make review easier.


2008-10-17 23:11:59

by Andrey Mirkin

Subject: [PATCH 01/10] Introduce trivial sys_checkpoint and sys_restart system calls

Right now they just return -ENOSYS. Later they will provide functionality
to checkpoint and restart a container.

Both syscalls take a file descriptor and flags as arguments.
In addition, sys_checkpoint takes as its first argument the PID of the
container's init (later it will be a container ID); sys_restart takes as its
first argument a container ID (right now it is not used).
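
As a usage sketch (not part of the patch itself), a checkpoint request could
be issued from userspace roughly as below. The syscall numbers match the
32-bit x86 table added in this patch; the image path and PID are hypothetical:

/* Sketch: call the new syscalls directly through syscall(2) on 32-bit x86. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

#define __NR_checkpoint 334
#define __NR_restart    335

int main(void)
{
        int fd = open("/tmp/ct.img", O_CREAT | O_WRONLY | O_TRUNC, 0600);
        pid_t init_pid = 1234;  /* PID of the container's init (hypothetical) */

        if (fd < 0)
                return 1;
        if (syscall(__NR_checkpoint, init_pid, fd, 0UL) < 0)
                perror("checkpoint");   /* -ENOSYS until later patches */
        close(fd);
        return 0;
}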

Signed-off-by: Andrey Mirkin <[email protected]>
---
Makefile | 2 +-
arch/x86/kernel/syscall_table_32.S | 2 +
checkpoint/Makefile | 1 +
checkpoint/sys_core.c | 38 ++++++++++++++++++++++++++++++++++++
include/asm-x86/unistd_32.h | 2 +
5 files changed, 44 insertions(+), 1 deletions(-)
create mode 100644 checkpoint/Makefile
create mode 100644 checkpoint/sys_core.c

diff --git a/Makefile b/Makefile
index ea413fa..ce49afd 100644
--- a/Makefile
+++ b/Makefile
@@ -619,7 +619,7 @@ export mod_strip_cmd


ifeq ($(KBUILD_EXTMOD),)
-core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/
+core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ checkpoint/

vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
$(core-y) $(core-m) $(drivers-y) $(drivers-m) \
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index fd9d4f4..4a0d7fb 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -333,3 +333,5 @@ ENTRY(sys_call_table)
.long sys_pipe2
.long sys_inotify_init1
.long sys_hijack
+ .long sys_checkpoint
+ .long sys_restart /* 335 */
diff --git a/checkpoint/Makefile b/checkpoint/Makefile
new file mode 100644
index 0000000..2276fb1
--- /dev/null
+++ b/checkpoint/Makefile
@@ -0,0 +1 @@
+obj-y += sys_core.o
diff --git a/checkpoint/sys_core.c b/checkpoint/sys_core.c
new file mode 100644
index 0000000..1a97fb6
--- /dev/null
+++ b/checkpoint/sys_core.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2008 Parallels, Inc.
+ *
+ * Author: Andrey Mirkin <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+
+/**
+ * sys_checkpoint - checkpoint a container from outside
+ * @pid: pid of the container init(1) process
+ * TODO: should switch to container id later
+ * @fd: file to which save the checkpoint image
+ * @flags: checkpoint operation flags
+ */
+asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned long flags)
+{
+ return -ENOSYS;
+}
+
+/**
+ * sys_restart - restart a container
+ * @ctid: container id which should be used to restart a container
+ * @fd: file from which read the checkpoint image
+ * @flags: restart operation flags
+ */
+asmlinkage long sys_restart(int ctid, int fd, unsigned long flags)
+{
+ return -ENOSYS;
+}
diff --git a/include/asm-x86/unistd_32.h b/include/asm-x86/unistd_32.h
index 70280da..1a09604 100644
--- a/include/asm-x86/unistd_32.h
+++ b/include/asm-x86/unistd_32.h
@@ -339,6 +339,8 @@
#define __NR_pipe2 331
#define __NR_inotify_init1 332
#define __NR_hijack 333
+#define __NR_checkpoint 334
+#define __NR_restart 335

#ifdef __KERNEL__

--
1.5.6

2008-10-17 23:12:36

by Andrey Mirkin

Subject: [PATCH 02/10] Make checkpoint/restart functionality modular

A config option CONFIG_CHECKPOINT is introduced.
A new structure, cpt_operations, is introduced to store pointers to the
checkpoint/restart functions provided by the module.
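
The point of the ops table is that the syscall stubs stay built into the
kernel while the real work lives in a loadable module; try_module_get() pins
the module for the duration of the call so it cannot be unloaded underneath
us. A condensed sketch of the idiom (the full version is in the patch below):

        if (try_module_get(cpt_ops.owner)) {
                if (cpt_ops.checkpoint)
                        ret = cpt_ops.checkpoint(pid, fd, flags);
                module_put(cpt_ops.owner);
        }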

Signed-off-by: Andrey Mirkin <[email protected]>
---
checkpoint/Kconfig | 7 ++++++
checkpoint/Makefile | 4 +++
checkpoint/checkpoint.h | 19 ++++++++++++++++++
checkpoint/sys.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++
checkpoint/sys_core.c | 29 ++++++++++++++++++++++++++-
init/Kconfig | 2 +
6 files changed, 107 insertions(+), 2 deletions(-)
create mode 100644 checkpoint/Kconfig
create mode 100644 checkpoint/checkpoint.h
create mode 100644 checkpoint/sys.c

diff --git a/checkpoint/Kconfig b/checkpoint/Kconfig
new file mode 100644
index 0000000..b9bc72d
--- /dev/null
+++ b/checkpoint/Kconfig
@@ -0,0 +1,7 @@
+config CHECKPOINT
+ tristate "Checkpoint & restart for containers"
+ depends on EXPERIMENTAL
+ default n
+ help
+ This option adds the module "cptrst", which allows saving a running
+ container to a file and restarting it later from this image file.
diff --git a/checkpoint/Makefile b/checkpoint/Makefile
index 2276fb1..bfe75d5 100644
--- a/checkpoint/Makefile
+++ b/checkpoint/Makefile
@@ -1 +1,5 @@
obj-y += sys_core.o
+
+obj-$(CONFIG_CHECKPOINT) += cptrst.o
+
+cptrst-objs := sys.o
diff --git a/checkpoint/checkpoint.h b/checkpoint/checkpoint.h
new file mode 100644
index 0000000..381a9bf
--- /dev/null
+++ b/checkpoint/checkpoint.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (C) 2008 Parallels, Inc.
+ *
+ * Author: Andrey Mirkin <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ */
+
+struct cpt_operations
+{
+ struct module * owner;
+ int (*checkpoint)(pid_t pid, int fd, unsigned long flags);
+ int (*restart)(int ctid, int fd, unsigned long flags);
+};
+extern struct cpt_operations cpt_ops;
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
new file mode 100644
index 0000000..010e4eb
--- /dev/null
+++ b/checkpoint/sys.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 2008 Parallels, Inc.
+ *
+ * Author: Andrey Mirkin <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/notifier.h>
+#include <linux/module.h>
+
+#include "checkpoint.h"
+
+MODULE_LICENSE("GPL");
+
+static int checkpoint(pid_t pid, int fd, unsigned long flags)
+{
+ return -ENOSYS;
+}
+
+static int restart(int ctid, int fd, unsigned long flags)
+{
+ return -ENOSYS;
+}
+
+static int __init init_cptrst(void)
+{
+ cpt_ops.owner = THIS_MODULE;
+ cpt_ops.checkpoint = checkpoint;
+ cpt_ops.restart = restart;
+ return 0;
+}
+module_init(init_cptrst);
+
+static void __exit exit_cptrst(void)
+{
+ cpt_ops.checkpoint = NULL;
+ cpt_ops.restart = NULL;
+ cpt_ops.owner = NULL;
+}
+module_exit(exit_cptrst);
diff --git a/checkpoint/sys_core.c b/checkpoint/sys_core.c
index 1a97fb6..528aaec 100644
--- a/checkpoint/sys_core.c
+++ b/checkpoint/sys_core.c
@@ -13,6 +13,13 @@
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
+#include <linux/notifier.h>
+#include <linux/module.h>
+
+#include "checkpoint.h"
+
+struct cpt_operations cpt_ops = { NULL, NULL, NULL };
+EXPORT_SYMBOL(cpt_ops);

/**
* sys_checkpoint - checkpoint a container from outside
@@ -23,7 +30,16 @@
*/
asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned long flags)
{
- return -ENOSYS;
+ int ret;
+
+ ret = -ENOSYS;
+
+ if (try_module_get(cpt_ops.owner)) {
+ if (cpt_ops.checkpoint)
+ ret = cpt_ops.checkpoint(pid, fd, flags);
+ module_put(cpt_ops.owner);
+ }
+ return ret;
}

/**
@@ -34,5 +50,14 @@ asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned long flags)
*/
asmlinkage long sys_restart(int ctid, int fd, unsigned long flags)
{
- return -ENOSYS;
+ int ret;
+
+ ret = -ENOSYS;
+
+ if (try_module_get(cpt_ops.owner)) {
+ if (cpt_ops.restart)
+ ret = cpt_ops.restart(ctid, fd, flags);
+ module_put(cpt_ops.owner);
+ }
+ return ret;
}
diff --git a/init/Kconfig b/init/Kconfig
index 4bd4b0c..b10f3cf 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -344,6 +344,8 @@ config CGROUP_FREEZER
Provides a way to freeze and unfreeze all tasks in a
cgroup

+source "checkpoint/Kconfig"
+
config FAIR_GROUP_SCHED
bool "Group scheduling for SCHED_OTHER"
depends on GROUP_SCHED
--
1.5.6

2008-10-17 23:12:52

by Andrey Mirkin

Subject: [PATCH 03/10] Introduce context structure needed during checkpointing/restart

Add functions for context allocation/destruction.
Introduce functions to read/write the image.
Introduce the image header and the object header.
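
To make the on-disk layout concrete, here is a hedged sketch of how a writer
could emit one object record using the headers introduced here; the
cpt_len = cpt_hdrlen + payload relationship is an assumption drawn from the
later dump patches, and write_object() itself is not part of the patch:

static int write_object(struct cpt_context *ctx, __u16 type,
                        const void *payload, size_t size)
{
        struct cpt_object_hdr hdr;
        int err;

        hdr.cpt_len = sizeof(hdr) + size;       /* header plus payload */
        hdr.cpt_hdrlen = sizeof(hdr);
        hdr.cpt_type = type;
        hdr.cpt_content = CPT_CONTENT_DATA;

        err = ctx->write(&hdr, sizeof(hdr), ctx);
        if (!err && size)
                err = ctx->write(payload, size, ctx);
        return err;
}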

Signed-off-by: Andrey Mirkin <[email protected]>
---
checkpoint/checkpoint.h | 40 +++++++++++++++
checkpoint/cpt_image.h | 63 ++++++++++++++++++++++++
checkpoint/sys.c | 125 +++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 225 insertions(+), 3 deletions(-)
create mode 100644 checkpoint/cpt_image.h

diff --git a/checkpoint/checkpoint.h b/checkpoint/checkpoint.h
index 381a9bf..8ea73f5 100644
--- a/checkpoint/checkpoint.h
+++ b/checkpoint/checkpoint.h
@@ -10,6 +10,8 @@
*
*/

+#include "cpt_image.h"
+
struct cpt_operations
{
struct module * owner;
@@ -17,3 +19,41 @@ struct cpt_operations
int (*restart)(int ctid, int fd, unsigned long flags);
};
extern struct cpt_operations cpt_ops;
+
+enum cpt_ctx_state
+{
+ CPT_CTX_ERROR = -1,
+ CPT_CTX_IDLE = 0,
+ CPT_CTX_DUMPING,
+ CPT_CTX_UNDUMPING
+};
+
+typedef struct cpt_context
+{
+ pid_t pid; /* should be changed to ctid later */
+ int ctx_id; /* context id */
+ struct list_head ctx_list;
+ int refcount;
+ int ctx_state;
+ struct semaphore main_sem;
+
+ int errno;
+
+ struct file *file;
+ loff_t current_object;
+
+ struct list_head object_array[CPT_OBJ_MAX];
+
+ int (*write)(const void *addr, size_t count, struct cpt_context *ctx);
+ int (*read)(void *addr, size_t count, struct cpt_context *ctx);
+} cpt_context_t;
+
+extern int debug_level;
+
+#define cpt_printk(lvl, fmt, args...) do { \
+ if (lvl <= debug_level) \
+ printk(fmt, ##args); \
+ } while (0)
+
+#define eprintk(a...) cpt_printk(1, "CPT ERR: " a)
+#define dprintk(a...) cpt_printk(1, "CPT DBG: " a)
diff --git a/checkpoint/cpt_image.h b/checkpoint/cpt_image.h
new file mode 100644
index 0000000..0338dd0
--- /dev/null
+++ b/checkpoint/cpt_image.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (C) 2008 Parallels, Inc.
+ *
+ * Author: Andrey Mirkin <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ */
+
+#ifndef __CPT_IMAGE_H_
+#define __CPT_IMAGE_H_ 1
+
+enum _cpt_object_type
+{
+ CPT_OBJ_TASK = 0,
+ CPT_OBJ_MAX,
+ /* The objects above are stored in memory while checkpointing */
+
+ CPT_OBJ_HEAD = 1024,
+};
+
+enum _cpt_content_type {
+ CPT_CONTENT_VOID,
+ CPT_CONTENT_ARRAY,
+ CPT_CONTENT_DATA,
+ CPT_CONTENT_NAME,
+ CPT_CONTENT_REF,
+ CPT_CONTENT_MAX
+};
+
+#define CPT_SIGNATURE0 0x79
+#define CPT_SIGNATURE1 0x1c
+#define CPT_SIGNATURE2 0x01
+#define CPT_SIGNATURE3 0x63
+
+struct cpt_head
+{
+ __u8 cpt_signature[4]; /* Magic number */
+ __u32 cpt_hdrlen; /* Header length */
+ __u16 cpt_image_major; /* Format of this file */
+ __u16 cpt_image_minor; /* Format of this file */
+ __u16 cpt_image_sublevel; /* Format of this file */
+ __u16 cpt_image_extra; /* Format of this file */
+ __u16 cpt_arch; /* Architecture */
+#define CPT_ARCH_I386 0
+ __u16 cpt_pad1;
+ __u32 cpt_pad2;
+ __u64 cpt_time; /* Time */
+} __attribute__ ((aligned (8)));
+
+/* Common object header. */
+struct cpt_object_hdr
+{
+ __u64 cpt_len; /* Size of current chunk of data */
+ __u32 cpt_hdrlen; /* Size of header */
+ __u16 cpt_type; /* Type of object */
+ __u16 cpt_content; /* Content type: array, reference... */
+} __attribute__ ((aligned (8)));
+
+#endif /* __CPT_IMAGE_H_ */
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 010e4eb..a561a06 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -13,21 +13,140 @@
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
-#include <linux/notifier.h>
+#include <linux/uaccess.h>
#include <linux/module.h>

#include "checkpoint.h"
+#include "cpt_image.h"

MODULE_LICENSE("GPL");

+/* Debug level, constant for now */
+int debug_level = 1;
+
+static int file_write(const void *addr, size_t count, struct cpt_context *ctx)
+{
+ mm_segment_t oldfs;
+ ssize_t err = -EBADF;
+ struct file *file = ctx->file;
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ if (file)
+ err = file->f_op->write(file, addr, count, &file->f_pos);
+ set_fs(oldfs);
+ if (err != count)
+ return err >= 0 ? -EIO : err;
+ return 0;
+}
+
+static int file_read(void *addr, size_t count, struct cpt_context *ctx)
+{
+ mm_segment_t oldfs;
+ ssize_t err = -EBADF;
+ struct file *file = ctx->file;
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ if (file)
+ err = file->f_op->read(file, addr, count, &file->f_pos);
+ set_fs(oldfs);
+ if (err != count)
+ return err >= 0 ? -EIO : err;
+ return 0;
+}
+
+struct cpt_context * context_alloc(void)
+{
+ struct cpt_context *ctx;
+ int i;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return NULL;
+
+ init_MUTEX(&ctx->main_sem);
+ ctx->refcount = 1;
+
+ ctx->current_object = -1;
+ ctx->write = file_write;
+ ctx->read = file_read;
+ for (i = 0; i < CPT_OBJ_MAX; i++) {
+ INIT_LIST_HEAD(&ctx->object_array[i]);
+ }
+
+ return ctx;
+}
+
+void context_release(struct cpt_context *ctx)
+{
+ ctx->ctx_state = CPT_CTX_ERROR;
+
+ kfree(ctx);
+}
+
+static void context_put(struct cpt_context *ctx)
+{
+ if (!--ctx->refcount)
+ context_release(ctx);
+}
+
static int checkpoint(pid_t pid, int fd, unsigned long flags)
{
- return -ENOSYS;
+ struct file *file;
+ struct cpt_context *ctx;
+ int err;
+
+ err = -EBADF;
+ file = fget(fd);
+ if (!file)
+ goto out;
+
+ err = -ENOMEM;
+ ctx = context_alloc();
+ if (!ctx)
+ goto out_file;
+
+ ctx->file = file;
+ ctx->ctx_state = CPT_CTX_DUMPING;
+
+ /* checkpoint */
+ err = -ENOSYS;
+
+ context_put(ctx);
+
+out_file:
+ fput(file);
+out:
+ return err;
}

static int restart(int ctid, int fd, unsigned long flags)
{
- return -ENOSYS;
+ struct file *file;
+ struct cpt_context *ctx;
+ int err;
+
+ err = -EBADF;
+ file = fget(fd);
+ if (!file)
+ goto out;
+
+ err = -ENOMEM;
+ ctx = context_alloc();
+ if (!ctx)
+ goto out_file;
+
+ ctx->file = file;
+ ctx->ctx_state = CPT_CTX_UNDUMPING;
+
+ /* restart */
+ err = -ENOSYS;
+
+ context_put(ctx);
+
+out_file:
+ fput(file);
+out:
+ return err;
}

static int __init init_cptrst(void)
--
1.5.6

2008-10-17 23:13:43

by Andrey Mirkin

Subject: [PATCH 06/10] Introduce functions to dump mm

Functions to dump the mm struct, VMAs and mm context are added.
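
As an orientation aid, the record sequence written for one mm looks roughly
like this (reconstructed from the dump code below, so treat it as a sketch):

        cpt_mm_image                        (CPT_OBJ_MM)
            cpt_vma_image                   (CPT_OBJ_VMA)    - one per VMA
                cpt_object_hdr + file path  (CPT_OBJ_NAME)   - file-backed VMAs only
                cpt_page_block + page data  (CPT_OBJ_PAGES)  - one per page range
            ...
            cpt_obj_bits + LDT contents     (CPT_OBJ_BITS)   - only if an LDT exists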

Signed-off-by: Andrey Mirkin <[email protected]>
---
arch/x86/mm/hugetlbpage.c | 2 +
checkpoint/Makefile | 2 +-
checkpoint/checkpoint.h | 1 +
checkpoint/cpt_image.h | 61 +++++++
checkpoint/cpt_mm.c | 434 +++++++++++++++++++++++++++++++++++++++++++++
checkpoint/cpt_process.c | 8 +-
mm/memory.c | 1 +
7 files changed, 504 insertions(+), 5 deletions(-)
create mode 100644 checkpoint/cpt_mm.c

diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 8f307d9..63028e7 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -12,6 +12,7 @@
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
+#include <linux/module.h>
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
@@ -221,6 +222,7 @@ int pmd_huge(pmd_t pmd)
{
return !!(pmd_val(pmd) & _PAGE_PSE);
}
+EXPORT_SYMBOL(pmd_huge);

int pud_huge(pud_t pud)
{
diff --git a/checkpoint/Makefile b/checkpoint/Makefile
index 457cc96..bbb0e37 100644
--- a/checkpoint/Makefile
+++ b/checkpoint/Makefile
@@ -2,4 +2,4 @@ obj-y += sys_core.o

obj-$(CONFIG_CHECKPOINT) += cptrst.o

-cptrst-objs := sys.o checkpoint.o cpt_process.o
+cptrst-objs := sys.o checkpoint.o cpt_process.o cpt_mm.o
diff --git a/checkpoint/checkpoint.h b/checkpoint/checkpoint.h
index 9e46b10..e3e6b66 100644
--- a/checkpoint/checkpoint.h
+++ b/checkpoint/checkpoint.h
@@ -61,3 +61,4 @@ extern int debug_level;

int dump_container(struct cpt_context *ctx);
int cpt_dump_task(struct task_struct *tsk, struct cpt_context *ctx);
+int cpt_dump_mm(struct task_struct *tsk, struct cpt_context *ctx);
diff --git a/checkpoint/cpt_image.h b/checkpoint/cpt_image.h
index cddfe37..160cf85 100644
--- a/checkpoint/cpt_image.h
+++ b/checkpoint/cpt_image.h
@@ -16,13 +16,19 @@
#include <linux/sched.h>
#include <asm/segment.h>

+#define CPT_NULL (~0ULL)
+
enum _cpt_object_type
{
CPT_OBJ_TASK = 0,
+ CPT_OBJ_MM,
CPT_OBJ_MAX,
/* The objects above are stored in memory while checkpointing */

CPT_OBJ_HEAD = 1024,
+ CPT_OBJ_VMA,
+ CPT_OBJ_PAGES,
+ CPT_OBJ_NAME,
CPT_OBJ_X86_REGS,
CPT_OBJ_BITS,
};
@@ -35,6 +41,7 @@ enum _cpt_content_type {
CPT_CONTENT_REF,
CPT_CONTENT_X86_FPUSTATE,
CPT_CONTENT_X86_FPUSTATE_OLD,
+ CPT_CONTENT_MM_CONTEXT,
CPT_CONTENT_MAX
};

@@ -123,6 +130,60 @@ struct cpt_task_image {
__u64 cpt_maj_flt;
} __attribute__ ((aligned (8)));

+struct cpt_mm_image {
+ __u64 cpt_len;
+ __u32 cpt_hdrlen;
+ __u16 cpt_type;
+ __u16 cpt_content;
+
+ __u64 cpt_start_code;
+ __u64 cpt_end_code;
+ __u64 cpt_start_data;
+ __u64 cpt_end_data;
+ __u64 cpt_start_brk;
+ __u64 cpt_brk;
+ __u64 cpt_start_stack;
+ __u64 cpt_start_arg;
+ __u64 cpt_end_arg;
+ __u64 cpt_start_env;
+ __u64 cpt_end_env;
+ __u64 cpt_def_flags;
+ __u64 cpt_flags;
+ __u64 cpt_map_count;
+} __attribute__ ((aligned (8)));
+
+struct cpt_vma_image
+{
+ __u64 cpt_len;
+ __u32 cpt_hdrlen;
+ __u16 cpt_type;
+ __u16 cpt_content;
+
+ __u64 cpt_file;
+ __u32 cpt_vma_type;
+#define CPT_VMA_TYPE_0 0
+#define CPT_VMA_FILE 1
+ __u32 cpt_pad;
+
+ __u64 cpt_start;
+ __u64 cpt_end;
+ __u64 cpt_flags;
+ __u64 cpt_pgprot;
+ __u64 cpt_pgoff;
+ __u64 cpt_page_num;
+} __attribute__ ((aligned (8)));
+
+struct cpt_page_block
+{
+ __u64 cpt_len;
+ __u32 cpt_hdrlen;
+ __u16 cpt_type;
+ __u16 cpt_content;
+
+ __u64 cpt_start;
+ __u64 cpt_end;
+} __attribute__ ((aligned (8)));
+
struct cpt_obj_bits
{
__u64 cpt_len;
diff --git a/checkpoint/cpt_mm.c b/checkpoint/cpt_mm.c
new file mode 100644
index 0000000..8a22c48
--- /dev/null
+++ b/checkpoint/cpt_mm.c
@@ -0,0 +1,434 @@
+/*
+ * Copyright (C) 2008 Parallels, Inc.
+ *
+ * Authors: Andrey Mirkin <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/mman.h>
+#include <linux/mnt_namespace.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/hugetlb.h>
+#include <asm/ldt.h>
+
+#include "checkpoint.h"
+#include "cpt_image.h"
+
+struct page_area
+{
+ int type;
+ unsigned long start;
+ unsigned long end;
+ pgoff_t pgoff;
+ loff_t mm;
+ __u64 list[16];
+};
+
+struct page_desc
+{
+ int type;
+ pgoff_t index;
+ loff_t mm;
+ int shared;
+};
+
+enum {
+ PD_ABSENT,
+ PD_COPY,
+ PD_FUNKEY,
+};
+
+/* 0: page can be obtained from backstore, or still not mapped anonymous page,
 or something else, which does not require copy.
+ 1: page requires copy
 2: page requires copy but its content is zero. Quite useless.
+ 3: wp page is shared after fork(). It is to be COWed when modified.
+ 4: page is something unsupported... We copy it right now.
+ */
+
+static void page_get_desc(struct vm_area_struct *vma, unsigned long addr,
+ struct page_desc *pdesc, cpt_context_t * ctx)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *ptep, pte;
+ spinlock_t *ptl;
+ struct page *pg = NULL;
+ pgoff_t linear_index = (addr - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff;
+
+ pdesc->index = linear_index;
+ pdesc->shared = 0;
+ pdesc->mm = CPT_NULL;
+
+ if (vma->vm_flags & VM_IO) {
+ pdesc->type = PD_ABSENT;
+ return;
+ }
+
+ pgd = pgd_offset(mm, addr);
+ if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+ goto out_absent;
+ pud = pud_offset(pgd, addr);
+ if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+ goto out_absent;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+ goto out_absent;
+#ifdef CONFIG_X86
+ if (pmd_huge(*pmd)) {
+ eprintk("page_huge\n");
+ goto out_unsupported;
+ }
+#endif
+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte = *ptep;
+ pte_unmap(ptep);
+
+ if (pte_none(pte))
+ goto out_absent_unlock;
+
+ if ((pg = vm_normal_page(vma, addr, pte)) == NULL) {
+ pdesc->type = PD_COPY;
+ goto out_unlock;
+ }
+
+ get_page(pg);
+ spin_unlock(ptl);
+
+ if (pg->mapping && !PageAnon(pg)) {
+ if (vma->vm_file == NULL) {
+ eprintk("pg->mapping!=NULL for fileless vma: %08lx\n", addr);
+ goto out_unsupported;
+ }
+ if (vma->vm_file->f_mapping != pg->mapping) {
+ eprintk("pg->mapping!=f_mapping: %08lx %p %p\n",
+ addr, vma->vm_file->f_mapping, pg->mapping);
+ goto out_unsupported;
+ }
+ pdesc->index = (pg->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
+ /* Page is in backstore. For us it is like
+ * it is not present.
+ */
+ goto out_absent;
+ }
+
+ if (PageReserved(pg)) {
+ /* Special case: ZERO_PAGE is used, when an
+ * anonymous page is accessed but not written. */
+ if (pg == ZERO_PAGE(addr)) {
+ if (pte_write(pte)) {
+ eprintk("not funny already, writable ZERO_PAGE\n");
+ goto out_unsupported;
+ }
+ /* Just copy it for now */
+ pdesc->type = PD_COPY;
+ goto out_put;
+ }
+ eprintk("reserved page %lu at %08lx\n", pg->index, addr);
+ goto out_unsupported;
+ }
+
+ if (!pg->mapping) {
+ eprintk("page without mapping at %08lx\n", addr);
+ goto out_unsupported;
+ }
+
+ pdesc->type = PD_COPY;
+
+out_put:
+ if (pg)
+ put_page(pg);
+ return;
+
+out_unlock:
+ spin_unlock(ptl);
+ goto out_put;
+
+out_absent_unlock:
+ spin_unlock(ptl);
+
+out_absent:
+ pdesc->type = PD_ABSENT;
+ goto out_put;
+
+out_unsupported:
+ pdesc->type = PD_FUNKEY;
+ goto out_put;
+}
+
+static int count_vma_pages(struct vm_area_struct *vma, struct cpt_context *ctx)
+{
+ unsigned long addr;
+ int page_num = 0;
+
+ for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
+ struct page_desc pd;
+
+ page_get_desc(vma, addr, &pd, ctx);
+
+ if (pd.type != PD_COPY) {
+ return -EINVAL;
+ } else {
+ page_num += 1;
+ }
+
+ }
+ return page_num;
+}
+
+/* ATTN: We give "current" to get_user_pages(). This is wrong, but get_user_pages()
+ * does not really need this thing. It just stores some page fault stats there.
+ *
+ * BUG: some archs (f.e. sparc64, but not Intel*) require flushing cache pages
+ * before accessing vma.
+ */
+static int dump_pages(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, struct cpt_context *ctx)
+{
+#define MAX_PAGE_BATCH 16
+ struct page *pg[MAX_PAGE_BATCH];
+ int npages = (end - start)/PAGE_SIZE;
+ int count = 0;
+
+ while (count < npages) {
+ int copy = npages - count;
+ int n;
+
+ if (copy > MAX_PAGE_BATCH)
+ copy = MAX_PAGE_BATCH;
+ n = get_user_pages(current, vma->vm_mm, start, copy,
+ 0, 1, pg, NULL);
+ if (n == copy) {
+ int i;
+ for (i=0; i<n; i++) {
+ char *maddr = kmap(pg[i]);
+ ctx->write(maddr, PAGE_SIZE, ctx);
+ kunmap(pg[i]);
+ }
+ } else {
+ eprintk("get_user_pages fault");
+ for ( ; n > 0; n--)
+ page_cache_release(pg[n-1]);
+ return -EFAULT;
+ }
+ start += n*PAGE_SIZE;
+ count += n;
+ for ( ; n > 0; n--)
+ page_cache_release(pg[n-1]);
+ }
+ return 0;
+}
+
+static int dump_page_block(struct vm_area_struct *vma,
+ struct cpt_page_block *pgb,
+ struct cpt_context *ctx)
+{
+ int err;
+ pgb->cpt_len = sizeof(*pgb) + pgb->cpt_end - pgb->cpt_start;
+ pgb->cpt_type = CPT_OBJ_PAGES;
+ pgb->cpt_hdrlen = sizeof(*pgb);
+ pgb->cpt_content = CPT_CONTENT_DATA;
+
+ err = ctx->write(pgb, sizeof(*pgb), ctx);
+ if (!err)
+ err = dump_pages(vma, pgb->cpt_start, pgb->cpt_end, ctx);
+
+ return err;
+}
+
+static int cpt_dump_dentry(struct path *p, cpt_context_t *ctx)
+{
+ int len;
+ char *path;
+ char *buf;
+ struct cpt_object_hdr o;
+
+ buf = (char *)__get_free_page(GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ path = d_path(p, buf, PAGE_SIZE);
+
+ if (IS_ERR(path)) {
+ free_page((unsigned long)buf);
+ return PTR_ERR(path);
+ }
+
+ len = buf + PAGE_SIZE - 1 - path;
+ o.cpt_len = sizeof(o) + len + 1;
+ o.cpt_type = CPT_OBJ_NAME;
+ o.cpt_hdrlen = sizeof(o);
+ o.cpt_content = CPT_CONTENT_NAME;
+ path[len] = 0;
+
+ ctx->write(&o, sizeof(o), ctx);
+ ctx->write(path, len + 1, ctx);
+ free_page((unsigned long)buf);
+
+ return 0;
+}
+
+static int dump_one_vma(struct mm_struct *mm,
+ struct vm_area_struct *vma, struct cpt_context *ctx)
+{
+ struct cpt_vma_image *v;
+ unsigned long addr;
+ int page_num;
+ int err;
+
+ v = kzalloc(sizeof(*v), GFP_KERNEL);
+ if (!v)
+ return -ENOMEM;
+
+ v->cpt_len = sizeof(*v);
+ v->cpt_type = CPT_OBJ_VMA;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_ARRAY;
+
+ v->cpt_start = vma->vm_start;
+ v->cpt_end = vma->vm_end;
+ v->cpt_flags = vma->vm_flags;
+ if (vma->vm_flags & VM_HUGETLB) {
+ eprintk("huge TLB VMAs are still not supported\n");
+ kfree(v);
+ return -EINVAL;
+ }
+ v->cpt_pgprot = vma->vm_page_prot.pgprot;
+ v->cpt_pgoff = vma->vm_pgoff;
+ v->cpt_file = CPT_NULL;
+ v->cpt_vma_type = CPT_VMA_TYPE_0;
+
+ page_num = count_vma_pages(vma, ctx);
+ if (page_num < 0) {
+ kfree(v);
+ return -EINVAL;
+ }
+ v->cpt_page_num = page_num;
+
+ if (vma->vm_file) {
+ v->cpt_file = 0;
+ v->cpt_vma_type = CPT_VMA_FILE;
+ }
+
+ ctx->write(v, sizeof(*v), ctx);
+ kfree(v);
+
+ if (vma->vm_file) {
+ err = cpt_dump_dentry(&vma->vm_file->f_path, ctx);
+ if (err < 0)
+ return err;
+ }
+
+ for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
+ struct page_desc pd;
+ struct cpt_page_block pgb;
+
+ page_get_desc(vma, addr, &pd, ctx);
+
+ if (pd.type == PD_FUNKEY || pd.type == PD_ABSENT) {
+ eprintk("dump_one_vma: funkey page\n");
+ return -EINVAL;
+ }
+
+ pgb.cpt_start = addr;
+ pgb.cpt_end = addr + PAGE_SIZE;
+ dump_page_block(vma, &pgb, ctx);
+ }
+
+ return 0;
+}
+
+static int cpt_dump_mm_context(struct mm_struct *mm, struct cpt_context *ctx)
+{
+#ifdef CONFIG_X86
+ if (mm->context.size) {
+ struct cpt_obj_bits b;
+ int size;
+
+ mutex_lock(&mm->context.lock);
+
+ b.cpt_type = CPT_OBJ_BITS;
+ b.cpt_len = sizeof(b);
+ b.cpt_content = CPT_CONTENT_MM_CONTEXT;
+ b.cpt_size = mm->context.size * LDT_ENTRY_SIZE;
+
+ ctx->write(&b, sizeof(b), ctx);
+
+ size = mm->context.size * LDT_ENTRY_SIZE;
+
+ ctx->write(mm->context.ldt, size, ctx);
+
+ mutex_unlock(&mm->context.lock);
+ }
+#endif
+ return 0;
+}
+
+int cpt_dump_mm(struct task_struct *tsk, struct cpt_context *ctx)
+{
+ struct mm_struct *mm = tsk->mm;
+ struct cpt_mm_image *v;
+ struct vm_area_struct *vma;
+ int err;
+
+ v = kzalloc(sizeof(*v), GFP_KERNEL);
+ if (!v)
+ return -ENOMEM;
+
+ v->cpt_len = sizeof(*v);
+ v->cpt_type = CPT_OBJ_MM;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_ARRAY;
+
+ down_read(&mm->mmap_sem);
+ v->cpt_start_code = mm->start_code;
+ v->cpt_end_code = mm->end_code;
+ v->cpt_start_data = mm->start_data;
+ v->cpt_end_data = mm->end_data;
+ v->cpt_start_brk = mm->start_brk;
+ v->cpt_brk = mm->brk;
+ v->cpt_start_stack = mm->start_stack;
+ v->cpt_start_arg = mm->arg_start;
+ v->cpt_end_arg = mm->arg_end;
+ v->cpt_start_env = mm->env_start;
+ v->cpt_end_env = mm->env_end;
+ v->cpt_def_flags = mm->def_flags;
+ v->cpt_flags = mm->flags;
+ v->cpt_map_count = mm->map_count;
+
+ err = ctx->write(v, sizeof(*v), ctx);
+ kfree(v);
+
+ if (err) {
+ eprintk("error during writing mm\n");
+ goto err_up;
+ }
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if ((err = dump_one_vma(mm, vma, ctx)) != 0)
+ goto err_up;
+ }
+
+ err = cpt_dump_mm_context(mm, ctx);
+
+err_up:
+ up_read(&mm->mmap_sem);
+
+ return err;
+}
+
diff --git a/checkpoint/cpt_process.c b/checkpoint/cpt_process.c
index 58f608d..1f7a54b 100644
--- a/checkpoint/cpt_process.c
+++ b/checkpoint/cpt_process.c
@@ -225,12 +225,12 @@ int cpt_dump_task(struct task_struct *tsk, struct cpt_context *ctx)

err = cpt_dump_task_struct(tsk, ctx);

- /* Dump task mm */
-
if (!err)
- cpt_dump_fpustate(tsk, ctx);
+ err = cpt_dump_mm(tsk, ctx);
+ if (!err)
+ err = cpt_dump_fpustate(tsk, ctx);
if (!err)
- cpt_dump_registers(tsk, ctx);
+ err = cpt_dump_registers(tsk, ctx);

return err;
}
diff --git a/mm/memory.c b/mm/memory.c
index 1002f47..479a294 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -481,6 +481,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
out:
return pfn_to_page(pfn);
}
+EXPORT_SYMBOL(vm_normal_page);

/*
* copy one vm_area from one task to the other. Assumes the page tables
--
1.5.6

2008-10-17 23:13:29

by Andrey Mirkin

Subject: [PATCH 05/10] Introduce function to dump process

Functions to dump the task struct, FPU state and registers are added.
All IDs are saved from the point of view of the process (container) namespace.
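
A short sketch of what "from the point of view of the container namespace"
means; task_pid_nr_ns() is what the dump code below actually uses, while the
helper around it is hypothetical:

/* Sketch: the same task has different PID values depending on the pid
 * namespace the number is resolved in; the dump stores the container view. */
static void show_pid_views(struct task_struct *tsk, struct pid_namespace *ct_ns)
{
        pid_t host_pid = task_pid_nr(tsk);          /* as seen by the host */
        pid_t ct_pid = task_pid_nr_ns(tsk, ct_ns);  /* as seen in the container */

        dprintk("task %s: host pid %d, container pid %d\n",
                tsk->comm, host_pid, ct_pid);
}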

Signed-off-by: Andrey Mirkin <[email protected]>
---
checkpoint/Makefile | 2 +-
checkpoint/checkpoint.c | 2 +-
checkpoint/checkpoint.h | 1 +
checkpoint/cpt_image.h | 123 ++++++++++++++++++++++++
checkpoint/cpt_process.c | 236 ++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 362 insertions(+), 2 deletions(-)
create mode 100644 checkpoint/cpt_process.c

diff --git a/checkpoint/Makefile b/checkpoint/Makefile
index 173346b..457cc96 100644
--- a/checkpoint/Makefile
+++ b/checkpoint/Makefile
@@ -2,4 +2,4 @@ obj-y += sys_core.o

obj-$(CONFIG_CHECKPOINT) += cptrst.o

-cptrst-objs := sys.o checkpoint.o
+cptrst-objs := sys.o checkpoint.o cpt_process.o
diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index c4bddce..aae198d 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -70,7 +70,7 @@ int dump_container(struct cpt_context *ctx)

/* Dump task here */
if (!err)
- err = -ENOSYS;
+ err = cpt_dump_task(root, ctx);

out:
ctx->nsproxy = NULL;
diff --git a/checkpoint/checkpoint.h b/checkpoint/checkpoint.h
index 6926aa2..9e46b10 100644
--- a/checkpoint/checkpoint.h
+++ b/checkpoint/checkpoint.h
@@ -60,3 +60,4 @@ extern int debug_level;
#define dprintk(a...) cpt_printk(1, "CPT DBG: " a)

int dump_container(struct cpt_context *ctx);
+int cpt_dump_task(struct task_struct *tsk, struct cpt_context *ctx);
diff --git a/checkpoint/cpt_image.h b/checkpoint/cpt_image.h
index 0338dd0..cddfe37 100644
--- a/checkpoint/cpt_image.h
+++ b/checkpoint/cpt_image.h
@@ -13,6 +13,9 @@
#ifndef __CPT_IMAGE_H_
#define __CPT_IMAGE_H_ 1

+#include <linux/sched.h>
+#include <asm/segment.h>
+
enum _cpt_object_type
{
CPT_OBJ_TASK = 0,
@@ -20,6 +23,8 @@ enum _cpt_object_type
/* The objects above are stored in memory while checkpointing */

CPT_OBJ_HEAD = 1024,
+ CPT_OBJ_X86_REGS,
+ CPT_OBJ_BITS,
};

enum _cpt_content_type {
@@ -28,6 +33,8 @@ enum _cpt_content_type {
CPT_CONTENT_DATA,
CPT_CONTENT_NAME,
CPT_CONTENT_REF,
+ CPT_CONTENT_X86_FPUSTATE,
+ CPT_CONTENT_X86_FPUSTATE_OLD,
CPT_CONTENT_MAX
};

@@ -60,4 +67,120 @@ struct cpt_object_hdr
__u16 cpt_content; /* Content type: array, reference... */
} __attribute__ ((aligned (8)));

+struct cpt_task_image {
+ __u64 cpt_len;
+ __u32 cpt_hdrlen;
+ __u16 cpt_type;
+ __u16 cpt_content;
+
+ __u64 cpt_state;
+ __u64 cpt_flags;
+#define CPT_PF_EXITING 0
+#define CPT_PF_FORKNOEXEC 1
+#define CPT_PF_SUPERPRIV 2
+#define CPT_PF_DUMPCORE 3
+#define CPT_PF_SIGNALED 4
+#define CPT_PF_USED_MATH 5
+
+ __u64 cpt_thrflags;
+ __u64 cpt_thrstatus;
+ __u32 cpt_pid;
+ __u32 cpt_tgid;
+ __u32 cpt_ppid;
+ __u32 cpt_rppid;
+ __u32 cpt_pgrp;
+ __u32 cpt_session;
+ __u32 cpt_old_pgrp;
+ __u32 cpt_leader;
+ __u64 cpt_set_tid;
+ __u64 cpt_clear_tid;
+ __u32 cpt_exit_code;
+ __u32 cpt_exit_signal;
+ __u32 cpt_pdeath_signal;
+ __u32 cpt_user;
+ __u32 cpt_uid;
+ __u32 cpt_euid;
+ __u32 cpt_suid;
+ __u32 cpt_fsuid;
+ __u32 cpt_gid;
+ __u32 cpt_egid;
+ __u32 cpt_sgid;
+ __u32 cpt_fsgid;
+ __u8 cpt_comm[TASK_COMM_LEN];
+ __u64 cpt_tls[GDT_ENTRY_TLS_ENTRIES];
+ __u64 cpt_utime;
+ __u64 cpt_stime;
+ __u64 cpt_utimescaled;
+ __u64 cpt_stimescaled;
+ __u64 cpt_gtime;
+ __u64 cpt_prev_utime;
+ __u64 cpt_prev_stime;
+ __u64 cpt_start_time;
+ __u64 cpt_real_start_time;
+ __u64 cpt_nvcsw;
+ __u64 cpt_nivcsw;
+ __u64 cpt_min_flt;
+ __u64 cpt_maj_flt;
+} __attribute__ ((aligned (8)));
+
+struct cpt_obj_bits
+{
+ __u64 cpt_len;
+ __u32 cpt_hdrlen;
+ __u16 cpt_type;
+ __u16 cpt_content;
+
+ __u32 cpt_size;
+ __u32 __cpt_pad1;
+} __attribute__ ((aligned (8)));
+
+#define CPT_SEG_ZERO 0
+#define CPT_SEG_TLS1 1
+#define CPT_SEG_TLS2 2
+#define CPT_SEG_TLS3 3
+#define CPT_SEG_USER32_DS 4
+#define CPT_SEG_USER32_CS 5
+#define CPT_SEG_USER64_DS 6
+#define CPT_SEG_USER64_CS 7
+#define CPT_SEG_LDT 256
+
+struct cpt_x86_regs
+{
+ __u64 cpt_len;
+ __u32 cpt_hdrlen;
+ __u16 cpt_type;
+ __u16 cpt_content;
+
+ __u32 cpt_debugreg[8];
+ __u32 cpt_gs;
+
+ __u32 cpt_bx;
+ __u32 cpt_cx;
+ __u32 cpt_dx;
+ __u32 cpt_si;
+ __u32 cpt_di;
+ __u32 cpt_bp;
+ __u32 cpt_ax;
+ __u32 cpt_ds;
+ __u32 cpt_es;
+ __u32 cpt_fs;
+ __u32 cpt_orig_ax;
+ __u32 cpt_ip;
+ __u32 cpt_cs;
+ __u32 cpt_flags;
+ __u32 cpt_sp;
+ __u32 cpt_ss;
+} __attribute__ ((aligned (8)));
+
+static inline __u64 cpt_timespec_export(struct timespec *tv)
+{
+ return (((u64)tv->tv_sec) << 32) + tv->tv_nsec;
+}
+
+static inline void cpt_timespec_import(struct timespec *tv, __u64 val)
+{
+ tv->tv_sec = val >> 32;
+ tv->tv_nsec = (val & 0xFFFFFFFF);
+}
+
#endif /* __CPT_IMAGE_H_ */
diff --git a/checkpoint/cpt_process.c b/checkpoint/cpt_process.c
new file mode 100644
index 0000000..58f608d
--- /dev/null
+++ b/checkpoint/cpt_process.c
@@ -0,0 +1,236 @@
+/*
+ * Copyright (C) 2008 Parallels, Inc.
+ *
+ * Author: Andrey Mirkin <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/version.h>
+#include <linux/nsproxy.h>
+
+#include "checkpoint.h"
+#include "cpt_image.h"
+
+static unsigned int encode_task_flags(unsigned int task_flags)
+{
+ unsigned int flags = 0;
+
+ if (task_flags & PF_EXITING)
+ flags |= (1 << CPT_PF_EXITING);
+ if (task_flags & PF_FORKNOEXEC)
+ flags |= (1 << CPT_PF_FORKNOEXEC);
+ if (task_flags & PF_SUPERPRIV)
+ flags |= (1 << CPT_PF_SUPERPRIV);
+ if (task_flags & PF_DUMPCORE)
+ flags |= (1 << CPT_PF_DUMPCORE);
+ if (task_flags & PF_SIGNALED)
+ flags |= (1 << CPT_PF_SIGNALED);
+ if (task_flags & PF_USED_MATH)
+ flags |= (1 << CPT_PF_USED_MATH);
+
+ return flags;
+
+}
+
+int cpt_dump_task_struct(struct task_struct *tsk, struct cpt_context *ctx)
+{
+ struct cpt_task_image *t;
+ int i;
+ int err;
+
+ t = kzalloc(sizeof(*t), GFP_KERNEL);
+ if (!t)
+ return -ENOMEM;
+
+ t->cpt_len = sizeof(*t);
+ t->cpt_type = CPT_OBJ_TASK;
+ t->cpt_hdrlen = sizeof(*t);
+ t->cpt_content = CPT_CONTENT_ARRAY;
+
+ t->cpt_state = tsk->state;
+ t->cpt_flags = encode_task_flags(tsk->flags);
+ t->cpt_exit_code = tsk->exit_code;
+ t->cpt_exit_signal = tsk->exit_signal;
+ t->cpt_pdeath_signal = tsk->pdeath_signal;
+ t->cpt_pid = task_pid_nr_ns(tsk, ctx->nsproxy->pid_ns);
+ t->cpt_tgid = task_tgid_nr_ns(tsk, ctx->nsproxy->pid_ns);
+ t->cpt_ppid = tsk->parent ?
+ task_pid_nr_ns(tsk->parent, ctx->nsproxy->pid_ns) : 0;
+ t->cpt_rppid = tsk->real_parent ?
+ task_pid_nr_ns(tsk->real_parent, ctx->nsproxy->pid_ns) : 0;
+ t->cpt_pgrp = task_pgrp_nr_ns(tsk, ctx->nsproxy->pid_ns);
+ t->cpt_session = task_session_nr_ns(tsk, ctx->nsproxy->pid_ns);
+ t->cpt_old_pgrp = 0;
+ if (tsk->signal->tty_old_pgrp)
+ t->cpt_old_pgrp = pid_vnr(tsk->signal->tty_old_pgrp);
+ t->cpt_leader = tsk->group_leader ? task_pid_vnr(tsk->group_leader) : 0;
+ t->cpt_utime = tsk->utime;
+ t->cpt_stime = tsk->stime;
+ t->cpt_utimescaled = tsk->utimescaled;
+ t->cpt_stimescaled = tsk->stimescaled;
+ t->cpt_gtime = tsk->gtime;
+ t->cpt_prev_utime = tsk->prev_utime;
+ t->cpt_prev_stime = tsk->prev_stime;
+ t->cpt_nvcsw = tsk->nvcsw;
+ t->cpt_nivcsw = tsk->nivcsw;
+ t->cpt_start_time = cpt_timespec_export(&tsk->start_time);
+ t->cpt_real_start_time = cpt_timespec_export(&tsk->real_start_time);
+ t->cpt_min_flt = tsk->min_flt;
+ t->cpt_maj_flt = tsk->maj_flt;
+ memcpy(t->cpt_comm, tsk->comm, TASK_COMM_LEN);
+ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) {
+ t->cpt_tls[i] = (((u64)tsk->thread.tls_array[i].b) << 32) +
+ tsk->thread.tls_array[i].a;
+ }
+ /* TODO: encode thread flags and status like task flags */
+ t->cpt_thrflags = task_thread_info(tsk)->flags & ~(1<<TIF_FREEZE);
+ t->cpt_thrstatus = task_thread_info(tsk)->status;
+ t->cpt_user = tsk->user->uid;
+ t->cpt_uid = tsk->uid;
+ t->cpt_euid = tsk->euid;
+ t->cpt_suid = tsk->suid;
+ t->cpt_fsuid = tsk->fsuid;
+ t->cpt_gid = tsk->gid;
+ t->cpt_egid = tsk->egid;
+ t->cpt_sgid = tsk->sgid;
+ t->cpt_fsgid = tsk->fsgid;
+
+ err = ctx->write(t, sizeof(*t), ctx);
+
+ kfree(t);
+ return err;
+}
+
+static int cpt_dump_fpustate(struct task_struct *tsk, struct cpt_context *ctx)
+{
+ struct cpt_obj_bits hdr;
+ int err;
+ int content;
+ unsigned long size;
+
+ content = CPT_CONTENT_X86_FPUSTATE;
+ size = sizeof(struct i387_fxsave_struct);
+#ifndef CONFIG_X86_64
+ if (!cpu_has_fxsr) {
+ size = sizeof(struct i387_fsave_struct);
+ content = CPT_CONTENT_X86_FPUSTATE_OLD;
+ }
+#endif
+
+ hdr.cpt_len = sizeof(hdr) + size;
+ hdr.cpt_type = CPT_OBJ_BITS;
+ hdr.cpt_hdrlen = sizeof(hdr);
+ hdr.cpt_content = content;
+ hdr.cpt_size = size;
+ err = ctx->write(&hdr, sizeof(hdr), ctx);
+ if (!err)
+ ctx->write(tsk->thread.xstate, size, ctx);
+ return err;
+}
+
+static u32 encode_segment(u32 segreg)
+{
+ segreg &= 0xFFFF;
+
+ if (segreg == 0)
+ return CPT_SEG_ZERO;
+ if ((segreg & 3) != 3) {
+ eprintk("Invalid RPL of a segment reg %x\n", segreg);
+ return CPT_SEG_ZERO;
+ }
+
+ /* LDT descriptor, it is just an index to LDT array */
+ if (segreg & 4)
+ return CPT_SEG_LDT + (segreg >> 3);
+
+ /* TLS descriptor. */
+ if ((segreg >> 3) >= GDT_ENTRY_TLS_MIN &&
+ (segreg >> 3) <= GDT_ENTRY_TLS_MAX)
+ return CPT_SEG_TLS1 + ((segreg>>3) - GDT_ENTRY_TLS_MIN);
+
+ /* One of the standard descriptors */
+#ifdef CONFIG_X86_64
+ if (segreg == __USER32_DS)
+ return CPT_SEG_USER32_DS;
+ if (segreg == __USER32_CS)
+ return CPT_SEG_USER32_CS;
+ if (segreg == __USER_DS)
+ return CPT_SEG_USER64_DS;
+ if (segreg == __USER_CS)
+ return CPT_SEG_USER64_CS;
+#else
+ if (segreg == __USER_DS)
+ return CPT_SEG_USER32_DS;
+ if (segreg == __USER_CS)
+ return CPT_SEG_USER32_CS;
+#endif
+ eprintk("Invalid segment reg %x\n", segreg);
+ return CPT_SEG_ZERO;
+}
+
+static int cpt_dump_registers(struct task_struct *tsk, struct cpt_context *ctx)
+{
+ struct cpt_x86_regs ri;
+ struct pt_regs *pt_regs;
+
+ ri.cpt_len = sizeof(ri);
+ ri.cpt_type = CPT_OBJ_X86_REGS;
+ ri.cpt_hdrlen = sizeof(ri);
+ ri.cpt_content = CPT_CONTENT_VOID;
+
+ ri.cpt_debugreg[0] = tsk->thread.debugreg0;
+ ri.cpt_debugreg[1] = tsk->thread.debugreg1;
+ ri.cpt_debugreg[2] = tsk->thread.debugreg2;
+ ri.cpt_debugreg[3] = tsk->thread.debugreg3;
+ ri.cpt_debugreg[4] = 0;
+ ri.cpt_debugreg[5] = 0;
+ ri.cpt_debugreg[6] = tsk->thread.debugreg6;
+ ri.cpt_debugreg[7] = tsk->thread.debugreg7;
+
+ pt_regs = task_pt_regs(tsk);
+
+ ri.cpt_fs = encode_segment(pt_regs->fs);
+ ri.cpt_gs = encode_segment(tsk->thread.gs);
+
+ ri.cpt_bx = pt_regs->bx;
+ ri.cpt_cx = pt_regs->cx;
+ ri.cpt_dx = pt_regs->dx;
+ ri.cpt_si = pt_regs->si;
+ ri.cpt_di = pt_regs->di;
+ ri.cpt_bp = pt_regs->bp;
+ ri.cpt_ax = pt_regs->ax;
+ ri.cpt_ds = encode_segment(pt_regs->ds);
+ ri.cpt_es = encode_segment(pt_regs->es);
+ ri.cpt_orig_ax = pt_regs->orig_ax;
+ ri.cpt_ip = pt_regs->ip;
+ ri.cpt_cs = encode_segment(pt_regs->cs);
+ ri.cpt_flags = pt_regs->flags;
+ ri.cpt_sp = pt_regs->sp;
+ ri.cpt_ss = encode_segment(pt_regs->ss);
+
+ return ctx->write(&ri, sizeof(ri), ctx);
+}
+
+int cpt_dump_task(struct task_struct *tsk, struct cpt_context *ctx)
+{
+ int err;
+
+ err = cpt_dump_task_struct(tsk, ctx);
+
+ /* Dump task mm */
+
+ if (!err)
+ cpt_dump_fpustate(tsk, ctx);
+ if (!err)
+ cpt_dump_registers(tsk, ctx);
+
+ return err;
+}
--
1.5.6

2008-10-17 23:13:08

by Andrey Mirkin

Subject: [PATCH 04/10] Introduce container dump function

Actually, right now we are going to dump only one process.
A function for dumping the head of the image file is added.
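
For completeness, a hedged userspace sketch of checking the signature bytes
that cpt_write_head() below places at the start of the image; the magic
values come from cpt_image.h (patch 03) and the file path is hypothetical:

/* Sketch: verify the four signature bytes at the start of a dump file. */
#include <stdio.h>

int main(void)
{
        unsigned char sig[4];
        FILE *f = fopen("/tmp/ct.img", "rb");   /* hypothetical image path */

        if (!f || fread(sig, 1, 4, f) != 4)
                return 1;
        fclose(f);
        return (sig[0] == 0x79 && sig[1] == 0x1c &&
                sig[2] == 0x01 && sig[3] == 0x63) ? 0 : 1;
}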

Signed-off-by: Andrey Mirkin <[email protected]>
---
checkpoint/Makefile | 2 +-
checkpoint/checkpoint.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++
checkpoint/checkpoint.h | 3 ++
checkpoint/sys.c | 3 +-
kernel/fork.c | 2 +
5 files changed, 87 insertions(+), 2 deletions(-)
create mode 100644 checkpoint/checkpoint.c

diff --git a/checkpoint/Makefile b/checkpoint/Makefile
index bfe75d5..173346b 100644
--- a/checkpoint/Makefile
+++ b/checkpoint/Makefile
@@ -2,4 +2,4 @@ obj-y += sys_core.o

obj-$(CONFIG_CHECKPOINT) += cptrst.o

-cptrst-objs := sys.o
+cptrst-objs := sys.o checkpoint.o
diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
new file mode 100644
index 0000000..c4bddce
--- /dev/null
+++ b/checkpoint/checkpoint.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2008 Parallels, Inc.
+ *
+ * Author: Andrey Mirkin <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/version.h>
+#include <linux/nsproxy.h>
+
+#include "checkpoint.h"
+
+static int cpt_write_head(struct cpt_context *ctx)
+{
+ struct cpt_head hdr;
+
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.cpt_signature[0] = CPT_SIGNATURE0;
+ hdr.cpt_signature[1] = CPT_SIGNATURE1;
+ hdr.cpt_signature[2] = CPT_SIGNATURE2;
+ hdr.cpt_signature[3] = CPT_SIGNATURE3;
+ hdr.cpt_hdrlen = sizeof(hdr);
+ hdr.cpt_image_major = (LINUX_VERSION_CODE >> 16) & 0xff;
+ hdr.cpt_image_minor = (LINUX_VERSION_CODE >> 8) & 0xff;
+ hdr.cpt_image_sublevel = (LINUX_VERSION_CODE) & 0xff;
+ hdr.cpt_image_extra = 0;
+#if defined(CONFIG_X86_32)
+ hdr.cpt_arch = CPT_ARCH_I386;
+#else
+#error Arch is not supported
+#endif
+ return ctx->write(&hdr, sizeof(hdr), ctx);
+}
+
+int dump_container(struct cpt_context *ctx)
+{
+ int err;
+ struct task_struct *root;
+
+ read_lock(&tasklist_lock);
+ root = find_task_by_vpid(ctx->pid);
+ if (root)
+ get_task_struct(root);
+ read_unlock(&tasklist_lock);
+
+ err = -ESRCH;
+ if (!root) {
+ eprintk("can not find root task\n");
+ return err;
+ }
+ rcu_read_lock();
+ ctx->nsproxy = task_nsproxy(root);
+ if (!ctx->nsproxy) {
+ eprintk("nsproxy is null\n");
+ rcu_read_unlock();
+ goto out;
+ }
+ get_nsproxy(ctx->nsproxy);
+ rcu_read_unlock();
+
+ err = cpt_write_head(ctx);
+
+ /* Dump task here */
+ if (!err)
+ err = -ENOSYS;
+
+out:
+ ctx->nsproxy = NULL;
+ put_task_struct(root);
+ return err;
+}
diff --git a/checkpoint/checkpoint.h b/checkpoint/checkpoint.h
index 8ea73f5..6926aa2 100644
--- a/checkpoint/checkpoint.h
+++ b/checkpoint/checkpoint.h
@@ -36,6 +36,7 @@ typedef struct cpt_context
int refcount;
int ctx_state;
struct semaphore main_sem;
+ struct nsproxy *nsproxy;

int errno;

@@ -57,3 +58,5 @@ extern int debug_level;

#define eprintk(a...) cpt_printk(1, "CPT ERR: " a)
#define dprintk(a...) cpt_printk(1, "CPT DBG: " a)
+
+int dump_container(struct cpt_context *ctx);
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index a561a06..1902fef 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -107,9 +107,10 @@ static int checkpoint(pid_t pid, int fd, unsigned long flags)

ctx->file = file;
ctx->ctx_state = CPT_CTX_DUMPING;
+ ctx->pid = pid;

/* checkpoint */
- err = -ENOSYS;
+ err = dump_container(ctx);

context_put(ctx);

diff --git a/kernel/fork.c b/kernel/fork.c
index 52b5037..f38b43d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -77,6 +77,7 @@ int max_threads; /* tunable limit on nr_threads */
DEFINE_PER_CPU(unsigned long, process_counts) = 0;

__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
+EXPORT_SYMBOL(tasklist_lock);

int nr_processes(void)
{
@@ -153,6 +154,7 @@ void __put_task_struct(struct task_struct *tsk)
if (!profile_handoff_task(tsk))
free_task(tsk);
}
+EXPORT_SYMBOL(__put_task_struct);

/*
* macro override instead of weak attribute alias, to workaround
--
1.5.6

2008-10-17 23:13:57

by Andrey Mirkin

Subject: [PATCH 07/10] Introduce function for restarting a container

Actually, right now this function will restart only one process.
A function to read the head of the dump file is introduced.

Signed-off-by: Andrey Mirkin <[email protected]>
---
checkpoint/Makefile | 2 +-
checkpoint/checkpoint.h | 1 +
checkpoint/restart.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++
checkpoint/sys.c | 2 +-
4 files changed, 90 insertions(+), 2 deletions(-)
create mode 100644 checkpoint/restart.c

diff --git a/checkpoint/Makefile b/checkpoint/Makefile
index bbb0e37..47c7852 100644
--- a/checkpoint/Makefile
+++ b/checkpoint/Makefile
@@ -2,4 +2,4 @@ obj-y += sys_core.o

obj-$(CONFIG_CHECKPOINT) += cptrst.o

-cptrst-objs := sys.o checkpoint.o cpt_process.o cpt_mm.o
+cptrst-objs := sys.o checkpoint.o cpt_process.o cpt_mm.o restart.o
diff --git a/checkpoint/checkpoint.h b/checkpoint/checkpoint.h
index e3e6b66..0608bb9 100644
--- a/checkpoint/checkpoint.h
+++ b/checkpoint/checkpoint.h
@@ -62,3 +62,4 @@ extern int debug_level;
int dump_container(struct cpt_context *ctx);
int cpt_dump_task(struct task_struct *tsk, struct cpt_context *ctx);
int cpt_dump_mm(struct task_struct *tsk, struct cpt_context *ctx);
+int restart_container(struct cpt_context *ctx);
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
new file mode 100644
index 0000000..acfcadb
--- /dev/null
+++ b/checkpoint/restart.c
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2008 Parallels, Inc.
+ *
+ * Author: Andrey Mirkin <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/version.h>
+
+#include "checkpoint.h"
+#include "cpt_image.h"
+
+int rst_get_object(int type, void *tmp, int size, struct cpt_context *ctx)
+{
+ int err;
+ struct cpt_object_hdr *hdr = tmp;
+ err = ctx->read(hdr, sizeof(struct cpt_object_hdr), ctx);
+ if (err)
+ return err;
+ if (type > 0 && type != hdr->cpt_type)
+ return -EINVAL;
+ if (hdr->cpt_hdrlen < sizeof(struct cpt_object_hdr))
+ return -EINVAL;
+ if (size < sizeof(struct cpt_object_hdr))
+ return -EINVAL;
+ if (hdr->cpt_len < hdr->cpt_hdrlen)
+ return -EINVAL;
+ if (size > hdr->cpt_hdrlen)
+ size = hdr->cpt_hdrlen;
+ if (size > sizeof(*hdr))
+ err = ctx->read(hdr + 1, size - sizeof(*hdr), ctx);
+ return err;
+}
+
+static int rst_read_head(struct cpt_context *ctx)
+{
+ struct cpt_head hdr;
+ int err;
+
+ err = -EBADF;
+ if (!ctx->file)
+ return err;
+
+ err = ctx->read(&hdr, sizeof(hdr), ctx);
+ if (err < 0)
+ return err;
+
+ if (hdr.cpt_signature[0] != CPT_SIGNATURE0 ||
+ hdr.cpt_signature[1] != CPT_SIGNATURE1 ||
+ hdr.cpt_signature[2] != CPT_SIGNATURE2 ||
+ hdr.cpt_signature[3] != CPT_SIGNATURE3) {
+ return -EINVAL;
+ }
+ if (KERNEL_VERSION(hdr.cpt_image_major, hdr.cpt_image_minor,
+ hdr.cpt_image_sublevel) != LINUX_VERSION_CODE)
+ return -EINVAL;
+
+#if defined(CONFIG_X86_32)
+ if (hdr.cpt_arch != CPT_ARCH_I386)
+ return -ENOSYS;
+#else
+#error Arch is not supported
+#endif
+
+ return 0;
+}
+
+int restart_container(struct cpt_context *ctx)
+{
+ int err;
+
+ err = rst_read_head(ctx);
+
+ /* Restart process */
+ if (!err)
+ err = -ENOSYS;
+
+ return err;
+}
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 1902fef..b92312a 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -140,7 +140,7 @@ static int restart(int ctid, int fd, unsigned long flags)
ctx->ctx_state = CPT_CTX_UNDUMPING;

/* restart */
- err = -ENOSYS;
+ err = restart_container(ctx);

context_put(ctx);

--
1.5.6

2008-10-17 23:14:44

by Andrey Mirkin

Subject: [PATCH 09/10] Introduce functions to restore mm

Functions to restore the mm, VMAs and mm context are added.
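
One detail worth calling out from the code below: mappings that must end up
read-only are temporarily made writable so their contents can be filled in,
then flipped back with mprotect. A minimal userspace analogy of that pattern
(the anonymous mapping here is purely illustrative):

/* Sketch: populate a mapping that should end up read-only by keeping it
 * writable only for the duration of the fill, as rst_restore_one_vma()
 * does with sys_mprotect(). */
#include <string.h>
#include <sys/mman.h>

static void *restore_ro_area(const void *saved, size_t len)
{
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return NULL;
        memcpy(p, saved, len);          /* fill while writable */
        mprotect(p, len, PROT_READ);    /* drop write permission again */
        return p;
}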

Signed-off-by: Andrey Mirkin <[email protected]>
---
checkpoint/Makefile | 2 +-
checkpoint/checkpoint.h | 1 +
checkpoint/cpt_image.h | 5 +
checkpoint/rst_mm.c | 320 ++++++++++++++++++++++++++++++++++++++++++++++
checkpoint/rst_process.c | 3 +-
mm/mmap.c | 1 +
mm/mprotect.c | 2 +
7 files changed, 332 insertions(+), 2 deletions(-)
create mode 100644 checkpoint/rst_mm.c

diff --git a/checkpoint/Makefile b/checkpoint/Makefile
index 689a0eb..19ca732 100644
--- a/checkpoint/Makefile
+++ b/checkpoint/Makefile
@@ -3,4 +3,4 @@ obj-y += sys_core.o
obj-$(CONFIG_CHECKPOINT) += cptrst.o

cptrst-objs := sys.o checkpoint.o cpt_process.o cpt_mm.o restart.o \
- rst_process.o
+ rst_process.o rst_mm.o
diff --git a/checkpoint/checkpoint.h b/checkpoint/checkpoint.h
index 1d0ca49..195fdc6 100644
--- a/checkpoint/checkpoint.h
+++ b/checkpoint/checkpoint.h
@@ -65,3 +65,4 @@ int cpt_dump_mm(struct task_struct *tsk, struct cpt_context *ctx);
int restart_container(struct cpt_context *ctx);
int rst_get_object(int type, void *tmp, int size, struct cpt_context *ctx);
int rst_restart_process(struct cpt_context *ctx);
+int rst_restore_mm(struct cpt_context *ctx);
diff --git a/checkpoint/cpt_image.h b/checkpoint/cpt_image.h
index 160cf85..e1fb483 100644
--- a/checkpoint/cpt_image.h
+++ b/checkpoint/cpt_image.h
@@ -233,6 +233,11 @@ struct cpt_x86_regs
__u32 cpt_ss;
} __attribute__ ((aligned (8)));

+static inline void __user * cpt_ptr_import(__u64 ptr)
+{
+ return (void*)(unsigned long)ptr;
+}
+
static inline __u64 cpt_timespec_export(struct timespec *tv)
{
return (((u64)tv->tv_sec) << 32) + tv->tv_nsec;
diff --git a/checkpoint/rst_mm.c b/checkpoint/rst_mm.c
new file mode 100644
index 0000000..fe53c45
--- /dev/null
+++ b/checkpoint/rst_mm.c
@@ -0,0 +1,320 @@
+/*
+ * Copyright (C) 2008 Parallels, Inc.
+ *
+ * Author: Andrey Mirkin <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <linux/syscalls.h>
+
+#include "checkpoint.h"
+#include "cpt_image.h"
+
+static unsigned long make_prot(struct cpt_vma_image *vmai)
+{
+ unsigned long prot = 0;
+
+ if (vmai->cpt_flags & VM_READ)
+ prot |= PROT_READ;
+ if (vmai->cpt_flags & VM_WRITE)
+ prot |= PROT_WRITE;
+ if (vmai->cpt_flags & VM_EXEC)
+ prot |= PROT_EXEC;
+ if (vmai->cpt_flags & VM_GROWSDOWN)
+ prot |= PROT_GROWSDOWN;
+ if (vmai->cpt_flags & VM_GROWSUP)
+ prot |= PROT_GROWSUP;
+ return prot;
+}
+
+static unsigned long make_flags(struct cpt_vma_image *vmai)
+{
+ unsigned long flags = MAP_FIXED;
+
+ if (vmai->cpt_flags&(VM_SHARED|VM_MAYSHARE))
+ flags |= MAP_SHARED;
+ else
+ flags |= MAP_PRIVATE;
+
+ if (vmai->cpt_file == CPT_NULL)
+ flags |= MAP_ANONYMOUS;
+ if (vmai->cpt_flags & VM_GROWSDOWN)
+ flags |= MAP_GROWSDOWN;
+#ifdef MAP_GROWSUP
+ if (vmai->cpt_flags & VM_GROWSUP)
+ flags |= MAP_GROWSUP;
+#endif
+ if (vmai->cpt_flags & VM_DENYWRITE)
+ flags |= MAP_DENYWRITE;
+ if (vmai->cpt_flags & VM_EXECUTABLE)
+ flags |= MAP_EXECUTABLE;
+ if (!(vmai->cpt_flags & VM_ACCOUNT))
+ flags |= MAP_NORESERVE;
+ return flags;
+}
+
+static int rst_restore_one_vma(struct cpt_context *ctx)
+{
+ int err;
+ int i;
+ unsigned long addr;
+ struct mm_struct *mm = current->mm;
+ struct cpt_vma_image vmai;
+ struct vm_area_struct *vma;
+ struct file *file = NULL;
+ unsigned long prot;
+
+ err = rst_get_object(CPT_OBJ_VMA, &vmai, sizeof(vmai), ctx);
+ if (err)
+ return err;
+
+ prot = make_prot(&vmai);
+
+ if (vmai.cpt_vma_type == CPT_VMA_FILE) {
+ struct cpt_object_hdr h;
+ int len;
+ char *path;
+
+ err = rst_get_object(CPT_OBJ_NAME, &h, sizeof(h), ctx);
+ if (err)
+ goto out;
+ len = h.cpt_len - sizeof(h);
+ if (len < 0) {
+ err = -EINVAL;
+ goto out;
+ }
+ path = kmalloc(len, GFP_KERNEL);
+ if (!path) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = ctx->read(path, len, ctx);
+ if (err) {
+ kfree(path);
+ goto out;
+ }
+
+ /* Just open file
+ TODO: open with correct flags */
+ file = filp_open(path, O_RDONLY, 0);
+ kfree(path);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto out;
+ }
+ }
+
+ down_write(&mm->mmap_sem);
+ addr = do_mmap_pgoff(file, vmai.cpt_start,
+ vmai.cpt_end - vmai.cpt_start,
+ prot, make_flags(&vmai),
+ vmai.cpt_pgoff);
+
+ if (addr != vmai.cpt_start) {
+ up_write(&mm->mmap_sem);
+
+ err = -EINVAL;
+ if (IS_ERR((void*)addr))
+ err = addr;
+ goto out;
+ }
+
+ vma = find_vma(mm, vmai.cpt_start);
+ if (vma == NULL) {
+ up_write(&mm->mmap_sem);
+ eprintk("cannot find mmapped vma\n");
+ err = -ESRCH;
+ goto out;
+ }
+
+ /* do_mmap_pgoff() can merge new area to previous one (not to the next,
+ * we mmap in order, the rest of mm is still unmapped). This can happen
+ * f.e. if flags are to be adjusted later, or if we had different
+ * anon_vma on two adjacent regions. Split it by brute force. */
+ if (vma->vm_start != vmai.cpt_start) {
+ err = split_vma(mm, vma, (unsigned long)vmai.cpt_start, 0);
+ if (err) {
+ up_write(&mm->mmap_sem);
+ eprintk("cannot split vma\n");
+ goto out;
+ }
+ }
+ up_write(&mm->mmap_sem);
+
+ for (i = 0; i < vmai.cpt_page_num; i++) {
+ struct cpt_page_block pb;
+
+ err = rst_get_object(CPT_OBJ_PAGES, &pb, sizeof(pb), ctx);
+ if (err)
+ goto out;
+ if (!(vmai.cpt_flags & VM_ACCOUNT) && !(prot & PROT_WRITE)) {
+ /* I guess this is get_user_pages() messed things,
+ * this happens f.e. when gdb inserts breakpoints.
+ */
+ int j;
+ for (j = 0; j < (pb.cpt_end-pb.cpt_start)/PAGE_SIZE; j++) {
+ struct page *page;
+ void *maddr;
+ err = get_user_pages(current, current->mm,
+ (unsigned long)pb.cpt_start +
+ j * PAGE_SIZE,
+ 1, 1, 1, &page, NULL);
+ if (err == 0)
+ err = -EFAULT;
+ if (err < 0) {
+ eprintk("get_user_pages: %d\n", err);
+ goto out;
+ }
+ err = 0;
+ maddr = kmap(page);
+ if (pb.cpt_content == CPT_CONTENT_VOID) {
+ memset(maddr, 0, PAGE_SIZE);
+ } else if (pb.cpt_content == CPT_CONTENT_DATA) {
+ err = ctx->read(maddr, PAGE_SIZE, ctx);
+ if (err) {
+ kunmap(page);
+ goto out;
+ }
+ } else {
+ err = -EINVAL;
+ kunmap(page);
+ goto out;
+ }
+ set_page_dirty_lock(page);
+ kunmap(page);
+ page_cache_release(page);
+ }
+ } else {
+ if (!(prot & PROT_WRITE))
+ sys_mprotect(vmai.cpt_start,
+ vmai.cpt_end - vmai.cpt_start,
+ prot | PROT_WRITE);
+ if (pb.cpt_content == CPT_CONTENT_VOID) {
+ int j;
+ for (j=0; j<(pb.cpt_end-pb.cpt_start)/sizeof(unsigned long); j++) {
+ err = __put_user(0UL, ((unsigned long __user*)(unsigned long)pb.cpt_start) + j);
+ if (err) {
+ eprintk("__put_user 2 %d\n", err);
+ goto out;
+ }
+ }
+ } else if (pb.cpt_content == CPT_CONTENT_DATA) {
+ err = ctx->read(cpt_ptr_import(pb.cpt_start),
+ pb.cpt_end - pb.cpt_start,
+ ctx);
+ if (err)
+ goto out;
+ } else {
+ err = -EINVAL;
+ goto out;
+ }
+ if (!(prot & PROT_WRITE))
+ sys_mprotect(vmai.cpt_start,
+ vmai.cpt_end - vmai.cpt_start,
+ prot);
+ }
+ }
+
+out:
+ if (file)
+ fput(file);
+ return err;
+}
+
+static int rst_restore_mm_context(struct cpt_context *ctx)
+{
+ struct cpt_obj_bits b;
+ struct mm_struct *mm = current->mm;
+ int oldsize = mm->context.size;
+ int err;
+ void *oldldt;
+ void *newldt;
+
+ err = rst_get_object(CPT_OBJ_BITS, &b, sizeof(b), ctx);
+ if (err)
+ return err;
+
+ if (b.cpt_size > PAGE_SIZE)
+ newldt = vmalloc(b.cpt_size);
+ else
+ newldt = kmalloc(b.cpt_size, GFP_KERNEL);
+
+ if (!newldt)
+ return -ENOMEM;
+
+ err = ctx->read(newldt, b.cpt_size, ctx);
+ if (err)
+ return err;
+
+ oldldt = mm->context.ldt;
+ mm->context.ldt = newldt;
+ mm->context.size = b.cpt_size / LDT_ENTRY_SIZE;
+
+ load_LDT(&mm->context);
+
+ if (oldsize) {
+ if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
+ vfree(oldldt);
+ else
+ kfree(oldldt);
+ }
+
+ return 0;
+}
+
+int rst_restore_mm(struct cpt_context *ctx)
+{
+ int err;
+ int i;
+ struct mm_struct *mm = current->mm;
+ struct cpt_mm_image m;
+
+ err = rst_get_object(CPT_OBJ_MM, &m, sizeof(m), ctx);
+ if (err)
+ return err;
+
+ down_write(&mm->mmap_sem);
+ do_munmap(mm, 0, TASK_SIZE);
+
+ mm->start_code = m.cpt_start_code;
+ mm->end_code = m.cpt_end_code;
+ mm->start_data = m.cpt_start_data;
+ mm->end_data = m.cpt_end_data;
+ mm->start_brk = m.cpt_start_brk;
+ mm->brk = m.cpt_brk;
+ mm->start_stack = m.cpt_start_stack;
+ mm->arg_start = m.cpt_start_arg;
+ mm->arg_end = m.cpt_end_arg;
+ mm->env_start = m.cpt_start_env;
+ mm->env_end = m.cpt_end_env;
+ mm->def_flags = m.cpt_def_flags;
+ mm->flags = m.cpt_flags;
+
+ up_write(&mm->mmap_sem);
+
+ for (i = 0; i < m.cpt_map_count; i++) {
+ err = rst_restore_one_vma(ctx);
+ if (err < 0)
+ goto out;
+ }
+
+ err = rst_restore_mm_context(ctx);
+out:
+ return err;
+}
+
diff --git a/checkpoint/rst_process.c b/checkpoint/rst_process.c
index b9f745e..9e448b2 100644
--- a/checkpoint/rst_process.c
+++ b/checkpoint/rst_process.c
@@ -210,7 +210,8 @@ static int restart_thread(void *arg)
err = rst_get_object(CPT_OBJ_TASK, ti, sizeof(*ti), ctx);
if (!err)
err = rst_restore_task_struct(current, ti, ctx);
- /* Restore mm here */
+ if (!err)
+ err = rst_restore_mm(ctx);
if (!err)
err = rst_restore_fpustate(current, ti, ctx);
if (!err)
diff --git a/mm/mmap.c b/mm/mmap.c
index 971d0ed..98d1ba9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1858,6 +1858,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,

return 0;
}
+EXPORT_SYMBOL(split_vma);

/* Munmap is split into 2 main parts -- this part which finds
* what needs doing, and the areas themselves, which do the
diff --git a/mm/mprotect.c b/mm/mprotect.c
index fded06f..47c7d75 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -22,6 +22,7 @@
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
+#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
@@ -317,3 +318,4 @@ out:
up_write(&current->mm->mmap_sem);
return error;
}
+EXPORT_SYMBOL(sys_mprotect);
--
1.5.6

2008-10-17 23:14:27

by Andrey Mirkin

[permalink] [raw]
Subject: [PATCH 08/10] Introduce functions to restart a process

Functions to restart a process and restore its state, FPU state and registers are added.

Signed-off-by: Andrey Mirkin <[email protected]>
---
arch/x86/kernel/entry_32.S | 21 +++
arch/x86/kernel/process_32.c | 3 +
checkpoint/Makefile | 3 +-
checkpoint/checkpoint.h | 2 +
checkpoint/restart.c | 2 +-
checkpoint/rst_process.c | 277 ++++++++++++++++++++++++++++++++++++++++++
kernel/sched.c | 1 +
7 files changed, 307 insertions(+), 2 deletions(-)
create mode 100644 checkpoint/rst_process.c

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 109792b..a4848a3 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -225,6 +225,7 @@ ENTRY(ret_from_fork)
GET_THREAD_INFO(%ebp)
popl %eax
CFI_ADJUST_CFA_OFFSET -4
+ret_from_fork_tail:
pushl $0x0202 # Reset kernel eflags
CFI_ADJUST_CFA_OFFSET 4
popfl
@@ -233,6 +234,26 @@ ENTRY(ret_from_fork)
CFI_ENDPROC
END(ret_from_fork)

+ENTRY(i386_ret_from_resume)
+ CFI_STARTPROC
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ call schedule_tail
+ GET_THREAD_INFO(%ebp)
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ movl (%esp), %eax
+ testl %eax, %eax
+ jz 1f
+ pushl %esp
+ call *%eax
+ addl $4, %esp
+1:
+ addl $256, %esp
+ jmp ret_from_fork_tail
+ CFI_ENDPROC
+END(i386_ret_from_resume)
+
/*
* Return to user mode is not as complex as all this looks,
* but we want the default path for a system call return to
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4711eed..1bdec02 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -58,6 +58,9 @@
#include <asm/kdebug.h>

asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
+EXPORT_SYMBOL(ret_from_fork);
+asmlinkage void i386_ret_from_resume(void) __asm__("i386_ret_from_resume");
+EXPORT_SYMBOL(i386_ret_from_resume);

DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
diff --git a/checkpoint/Makefile b/checkpoint/Makefile
index 47c7852..689a0eb 100644
--- a/checkpoint/Makefile
+++ b/checkpoint/Makefile
@@ -2,4 +2,5 @@ obj-y += sys_core.o

obj-$(CONFIG_CHECKPOINT) += cptrst.o

-cptrst-objs := sys.o checkpoint.o cpt_process.o cpt_mm.o restart.o
+cptrst-objs := sys.o checkpoint.o cpt_process.o cpt_mm.o restart.o \
+ rst_process.o
diff --git a/checkpoint/checkpoint.h b/checkpoint/checkpoint.h
index 0608bb9..1d0ca49 100644
--- a/checkpoint/checkpoint.h
+++ b/checkpoint/checkpoint.h
@@ -63,3 +63,5 @@ int dump_container(struct cpt_context *ctx);
int cpt_dump_task(struct task_struct *tsk, struct cpt_context *ctx);
int cpt_dump_mm(struct task_struct *tsk, struct cpt_context *ctx);
int restart_container(struct cpt_context *ctx);
+int rst_get_object(int type, void *tmp, int size, struct cpt_context *ctx);
+int rst_restart_process(struct cpt_context *ctx);
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index acfcadb..62cef28 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -81,7 +81,7 @@ int restart_container(struct cpt_context *ctx)

/* Restart process */
if (!err)
- err = -ENOSYS;
+ err = rst_restart_process(ctx);

return err;
}
diff --git a/checkpoint/rst_process.c b/checkpoint/rst_process.c
new file mode 100644
index 0000000..b9f745e
--- /dev/null
+++ b/checkpoint/rst_process.c
@@ -0,0 +1,277 @@
+/*
+ * Copyright (C) 2008 Parallels, Inc.
+ *
+ * Author: Andrey Mirkin <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/version.h>
+#include <linux/module.h>
+
+#include "checkpoint.h"
+#include "cpt_image.h"
+
+#define HOOK_RESERVE 256
+
+struct thr_context {
+ struct completion complete;
+ int error;
+ struct cpt_context *ctx;
+ struct task_struct *tsk;
+};
+
+int local_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid)
+{
+ pid_t ret;
+
+ if (current->fs == NULL) {
+ /* do_fork_pid() hates processes without fs, oopses. */
+ eprintk("local_kernel_thread: current->fs==NULL\n");
+ return -EINVAL;
+ }
+ if (!try_module_get(THIS_MODULE))
+ return -EBUSY;
+ ret = kernel_thread(fn, arg, flags);
+ if (ret < 0)
+ module_put(THIS_MODULE);
+ return ret;
+}
+
+static unsigned int decode_task_flags(unsigned int task_flags)
+{
+ unsigned int flags = 0;
+
+ if (task_flags & (1 << CPT_PF_EXITING))
+ flags |= PF_EXITING;
+ if (task_flags & (1 << CPT_PF_FORKNOEXEC))
+ flags |= PF_FORKNOEXEC;
+ if (task_flags & (1 << CPT_PF_SUPERPRIV))
+ flags |= PF_SUPERPRIV;
+ if (task_flags & (1 << CPT_PF_DUMPCORE))
+ flags |= PF_DUMPCORE;
+ if (task_flags & (1 << CPT_PF_SIGNALED))
+ flags |= PF_SIGNALED;
+
+ return flags;
+
+}
+
+int rst_restore_task_struct(struct task_struct *tsk, struct cpt_task_image *ti,
+ struct cpt_context *ctx)
+{
+ int i;
+
+ /* Restore only saved flags, comm and tls for now */
+ tsk->flags = decode_task_flags(ti->cpt_flags);
+ clear_tsk_thread_flag(tsk, TIF_FREEZE);
+ memcpy(tsk->comm, ti->cpt_comm, TASK_COMM_LEN);
+ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) {
+ tsk->thread.tls_array[i].a = ti->cpt_tls[i] & 0xFFFFFFFF;
+ tsk->thread.tls_array[i].b = ti->cpt_tls[i] >> 32;
+ }
+
+ return 0;
+}
+
+static int rst_restore_fpustate(struct task_struct *tsk, struct cpt_task_image *ti,
+ struct cpt_context *ctx)
+{
+ struct cpt_obj_bits hdr;
+ int err;
+ char *buf;
+
+ clear_stopped_child_used_math(tsk);
+
+ err = rst_get_object(CPT_OBJ_BITS, &hdr, sizeof(hdr), ctx);
+ if (err < 0)
+ return err;
+
+ buf = kmalloc(hdr.cpt_size, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ err = ctx->read(buf, hdr.cpt_size, ctx);
+ if (err)
+ goto out;
+
+ if (hdr.cpt_content == CPT_CONTENT_X86_FPUSTATE && cpu_has_fxsr) {
+ memcpy(&tsk->thread.xstate, buf,
+ sizeof(struct i387_fxsave_struct));
+ if (ti->cpt_flags & CPT_PF_USED_MATH)
+ set_stopped_child_used_math(tsk);
+ }
+#ifndef CONFIG_X86_64
+ else if (hdr.cpt_content == CPT_CONTENT_X86_FPUSTATE_OLD &&
+ !cpu_has_fxsr) {
+ memcpy(&tsk->thread.xstate, buf,
+ sizeof(struct i387_fsave_struct));
+ if (ti->cpt_flags & CPT_PF_USED_MATH)
+ set_stopped_child_used_math(tsk);
+ }
+#endif
+
+out:
+ kfree(buf);
+ return err;
+}
+
+static u32 decode_segment(u32 segid)
+{
+ if (segid == CPT_SEG_ZERO)
+ return 0;
+
+ /* TLS descriptors */
+ if (segid <= CPT_SEG_TLS3)
+ return ((GDT_ENTRY_TLS_MIN + segid - CPT_SEG_TLS1) << 3) + 3;
+
+ /* LDT descriptor, it is just an index to LDT array */
+ if (segid >= CPT_SEG_LDT)
+ return ((segid - CPT_SEG_LDT) << 3) | 7;
+
+ /* Check for one of standard descriptors */
+ if (segid == CPT_SEG_USER32_DS)
+ return __USER_DS;
+ if (segid == CPT_SEG_USER32_CS)
+ return __USER_CS;
+
+ eprintk("Invalid segment reg %d\n", segid);
+ return 0;
+}
+
+static int rst_restore_registers(struct task_struct *tsk, struct cpt_context *ctx)
+{
+ struct cpt_x86_regs ri;
+ struct pt_regs *regs = task_pt_regs(tsk);
+ extern char i386_ret_from_resume;
+ int err;
+
+ err = rst_get_object(CPT_OBJ_X86_REGS, &ri, sizeof(ri), ctx);
+ if (err < 0)
+ return err;
+
+ tsk->thread.sp = (unsigned long) regs;
+ tsk->thread.sp0 = (unsigned long) (regs+1);
+ tsk->thread.ip = (unsigned long) &i386_ret_from_resume;
+
+ tsk->thread.gs = decode_segment(ri.cpt_gs);
+ tsk->thread.debugreg0 = ri.cpt_debugreg[0];
+ tsk->thread.debugreg1 = ri.cpt_debugreg[1];
+ tsk->thread.debugreg2 = ri.cpt_debugreg[2];
+ tsk->thread.debugreg3 = ri.cpt_debugreg[3];
+ tsk->thread.debugreg6 = ri.cpt_debugreg[6];
+ tsk->thread.debugreg7 = ri.cpt_debugreg[7];
+
+ regs->bx = ri.cpt_bx;
+ regs->cx = ri.cpt_cx;
+ regs->dx = ri.cpt_dx;
+ regs->si = ri.cpt_si;
+ regs->di = ri.cpt_di;
+ regs->bp = ri.cpt_bp;
+ regs->ax = ri.cpt_ax;
+ regs->orig_ax = ri.cpt_orig_ax;
+ regs->ip = ri.cpt_ip;
+ regs->flags = ri.cpt_flags;
+ regs->sp = ri.cpt_sp;
+
+ regs->cs = decode_segment(ri.cpt_cs);
+ regs->ss = decode_segment(ri.cpt_ss);
+ regs->ds = decode_segment(ri.cpt_ds);
+ regs->es = decode_segment(ri.cpt_es);
+ regs->fs = decode_segment(ri.cpt_fs);
+
+ tsk->thread.sp -= HOOK_RESERVE;
+ memset((void*)tsk->thread.sp, 0, HOOK_RESERVE);
+
+ return 0;
+}
+
+static int restart_thread(void *arg)
+{
+ struct thr_context *thr_ctx = arg;
+ struct cpt_context *ctx;
+ struct cpt_task_image *ti;
+ int err;
+
+ current->state = TASK_UNINTERRUPTIBLE;
+
+ ctx = thr_ctx->ctx;
+ ti = kmalloc(sizeof(*ti), GFP_KERNEL);
+ if (!ti)
+ return -ENOMEM;
+
+ err = rst_get_object(CPT_OBJ_TASK, ti, sizeof(*ti), ctx);
+ if (!err)
+ err = rst_restore_task_struct(current, ti, ctx);
+ /* Restore mm here */
+ if (!err)
+ err = rst_restore_fpustate(current, ti, ctx);
+ if (!err)
+ err = rst_restore_registers(current, ctx);
+
+ thr_ctx->error = err;
+ complete(&thr_ctx->complete);
+
+ if (!err && (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) {
+ do_exit(ti->cpt_exit_code);
+ } else {
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ }
+
+ kfree(ti);
+ schedule();
+
+ eprintk("leaked %d/%d %p\n", task_pid_nr(current), task_pid_vnr(current), current->mm);
+
+ module_put(THIS_MODULE);
+ complete_and_exit(NULL, 0);
+ return 0;
+}
+static int create_root_task(struct cpt_context *ctx,
+ struct thr_context *thr_ctx)
+{
+ struct task_struct *tsk;
+ int pid;
+
+ thr_ctx->ctx = ctx;
+ thr_ctx->error = 0;
+ init_completion(&thr_ctx->complete);
+
+ /* We should also create container here */
+ pid = local_kernel_thread(restart_thread, thr_ctx,
+ CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+ CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET, 0);
+ if (pid < 0)
+ return pid;
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_vpid(pid);
+ if (tsk)
+ get_task_struct(tsk);
+ read_unlock(&tasklist_lock);
+ if (tsk == NULL)
+ return -ESRCH;
+ thr_ctx->tsk = tsk;
+ return 0;
+}
+
+int rst_restart_process(struct cpt_context *ctx)
+{
+ struct thr_context thr_ctx_root;
+ int err;
+
+ err = create_root_task(ctx, &thr_ctx_root);
+ if (err)
+ return err;
+
+ wait_for_completion(&thr_ctx_root.complete);
+ wait_task_inactive(thr_ctx_root.tsk, 0);
+
+ return err;
+}
diff --git a/kernel/sched.c b/kernel/sched.c
index 04160d2..94a23e5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1970,6 +1970,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)

return ncsw;
}
+EXPORT_SYMBOL(wait_task_inactive);

/***
* kick_process - kick a running thread to enter/exit the kernel
--
1.5.6

2008-10-17 23:15:00

by Andrey Mirkin

[permalink] [raw]
Subject: [PATCH 10/10] Add support for multiple processes

The whole tree of processes can be checkpointed and restarted now.
Shared objects are not supported yet.

Signed-off-by: Andrey Mirkin <[email protected]>
---
checkpoint/cpt_image.h | 2 +
checkpoint/cpt_process.c | 24 +++++++++++++
checkpoint/rst_process.c | 85 +++++++++++++++++++++++++++-------------------
3 files changed, 76 insertions(+), 35 deletions(-)

diff --git a/checkpoint/cpt_image.h b/checkpoint/cpt_image.h
index e1fb483..f370df2 100644
--- a/checkpoint/cpt_image.h
+++ b/checkpoint/cpt_image.h
@@ -128,6 +128,8 @@ struct cpt_task_image {
__u64 cpt_nivcsw;
__u64 cpt_min_flt;
__u64 cpt_maj_flt;
+ __u32 cpt_children_num;
+ __u32 cpt_pad;
} __attribute__ ((aligned (8)));

struct cpt_mm_image {
diff --git a/checkpoint/cpt_process.c b/checkpoint/cpt_process.c
index 1f7a54b..d73ec3c 100644
--- a/checkpoint/cpt_process.c
+++ b/checkpoint/cpt_process.c
@@ -40,6 +40,19 @@ static unsigned int encode_task_flags(unsigned int task_flags)

}

+static int cpt_count_children(struct task_struct *tsk, struct cpt_context *ctx)
+{
+ int num = 0;
+ struct task_struct *child;
+
+ list_for_each_entry(child, &tsk->children, sibling) {
+ if (child->parent != tsk)
+ continue;
+ num++;
+ }
+ return num;
+}
+
int cpt_dump_task_struct(struct task_struct *tsk, struct cpt_context *ctx)
{
struct cpt_task_image *t;
@@ -102,6 +115,7 @@ int cpt_dump_task_struct(struct task_struct *tsk, struct cpt_context *ctx)
t->cpt_egid = tsk->egid;
t->cpt_sgid = tsk->sgid;
t->cpt_fsgid = tsk->fsgid;
+ t->cpt_children_num = cpt_count_children(tsk, ctx);

err = ctx->write(t, sizeof(*t), ctx);

@@ -231,6 +245,16 @@ int cpt_dump_task(struct task_struct *tsk, struct cpt_context *ctx)
err = cpt_dump_fpustate(tsk, ctx);
if (!err)
err = cpt_dump_registers(tsk, ctx);
+ if (!err) {
+ struct task_struct *child;
+ list_for_each_entry(child, &tsk->children, sibling) {
+ if (child->parent != tsk)
+ continue;
+ err = cpt_dump_task(child, ctx);
+ if (err)
+ break;
+ }
+ }

return err;
}
diff --git a/checkpoint/rst_process.c b/checkpoint/rst_process.c
index 9e448b2..c088833 100644
--- a/checkpoint/rst_process.c
+++ b/checkpoint/rst_process.c
@@ -25,7 +25,7 @@ struct thr_context {
struct completion complete;
int error;
struct cpt_context *ctx;
- struct task_struct *tsk;
+ struct cpt_task_image *ti;
};

int local_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid)
@@ -199,17 +199,14 @@ static int restart_thread(void *arg)
struct cpt_context *ctx;
struct cpt_task_image *ti;
int err;
+ int i;

current->state = TASK_UNINTERRUPTIBLE;

ctx = thr_ctx->ctx;
- ti = kmalloc(sizeof(*ti), GFP_KERNEL);
- if (!ti)
- return -ENOMEM;
+ ti = thr_ctx->ti;

- err = rst_get_object(CPT_OBJ_TASK, ti, sizeof(*ti), ctx);
- if (!err)
- err = rst_restore_task_struct(current, ti, ctx);
+ err = rst_restore_task_struct(current, ti, ctx);
if (!err)
err = rst_restore_mm(ctx);
if (!err)
@@ -217,6 +214,12 @@ static int restart_thread(void *arg)
if (!err)
err = rst_restore_registers(current, ctx);

+ for (i = 0; i < ti->cpt_children_num; i++) {
+ err = rst_restart_process(ctx);
+ if (err)
+ break;
+ }
+
thr_ctx->error = err;
complete(&thr_ctx->complete);

@@ -226,7 +229,6 @@ static int restart_thread(void *arg)
__set_current_state(TASK_UNINTERRUPTIBLE);
}

- kfree(ti);
schedule();

eprintk("leaked %d/%d %p\n", task_pid_nr(current), task_pid_vnr(current), current->mm);
@@ -235,44 +237,57 @@ static int restart_thread(void *arg)
complete_and_exit(NULL, 0);
return 0;
}
-static int create_root_task(struct cpt_context *ctx,
- struct thr_context *thr_ctx)
+
+int rst_restart_process(struct cpt_context *ctx)
{
+ struct thr_context thr_ctx;
struct task_struct *tsk;
+ struct cpt_task_image *ti;
int pid;
+ int err;

- thr_ctx->ctx = ctx;
- thr_ctx->error = 0;
- init_completion(&thr_ctx->complete);
+ thr_ctx.ctx = ctx;
+ thr_ctx.error = 0;
+ init_completion(&thr_ctx.complete);

- /* We should also create container here */
- pid = local_kernel_thread(restart_thread, thr_ctx,
- CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET, 0);
- if (pid < 0)
- return pid;
+ ti = kmalloc(sizeof(*ti), GFP_KERNEL);
+ if (!ti)
+ return -ENOMEM;
+
+ err = rst_get_object(CPT_OBJ_TASK, ti, sizeof(*ti), ctx);
+ if (err)
+ goto err_free;
+ thr_ctx.ti = ti;
+
+ if (ti->cpt_pid == 1) {
+ /* We should also create container here */
+ pid = local_kernel_thread(restart_thread, &thr_ctx,
+ CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+ CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET, 0);
+ } else {
+ /* We should fork here a child with the same pid and
+ correct flags */
+ pid = local_kernel_thread(restart_thread, &thr_ctx, 0, 0);
+ }
+ if (pid < 0) {
+ err = pid;
+ goto err_free;
+ }
read_lock(&tasklist_lock);
tsk = find_task_by_vpid(pid);
if (tsk)
get_task_struct(tsk);
read_unlock(&tasklist_lock);
- if (tsk == NULL)
- return -ESRCH;
- thr_ctx->tsk = tsk;
- return 0;
-}
-
-int rst_restart_process(struct cpt_context *ctx)
-{
- struct thr_context thr_ctx_root;
- int err;
-
- err = create_root_task(ctx, &thr_ctx_root);
- if (err)
- return err;
+ if (tsk == NULL) {
+ err = -ESRCH;
+ goto err_free;
+ }

- wait_for_completion(&thr_ctx_root.complete);
- wait_task_inactive(thr_ctx_root.tsk, 0);
+ wait_for_completion(&thr_ctx.complete);
+ wait_task_inactive(tsk, 0);
+ err = thr_ctx.error;

+err_free:
+ kfree(ti);
return err;
}
--
1.5.6

2008-10-20 09:25:06

by Cédric Le Goater

[permalink] [raw]
Subject: Re: [PATCH 08/10] Introduce functions to restart a process

Hello Andrey !


> diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
> index 109792b..a4848a3 100644
> --- a/arch/x86/kernel/entry_32.S
> +++ b/arch/x86/kernel/entry_32.S
> @@ -225,6 +225,7 @@ ENTRY(ret_from_fork)
> GET_THREAD_INFO(%ebp)
> popl %eax
> CFI_ADJUST_CFA_OFFSET -4
> +ret_from_fork_tail:
> pushl $0x0202 # Reset kernel eflags
> CFI_ADJUST_CFA_OFFSET 4
> popfl
> @@ -233,6 +234,26 @@ ENTRY(ret_from_fork)
> CFI_ENDPROC
> END(ret_from_fork)
>
> +ENTRY(i386_ret_from_resume)
> + CFI_STARTPROC
> + pushl %eax
> + CFI_ADJUST_CFA_OFFSET 4
> + call schedule_tail
> + GET_THREAD_INFO(%ebp)
> + popl %eax
> + CFI_ADJUST_CFA_OFFSET -4
> + movl (%esp), %eax
> + testl %eax, %eax
> + jz 1f
> + pushl %esp
> + call *%eax
> + addl $4, %esp
> +1:
> + addl $256, %esp
> + jmp ret_from_fork_tail
> + CFI_ENDPROC
> +END(i386_ret_from_resume)

Could you explain why you need to do this:

	call *%eax

Is it related to the freezer code?

C.

2008-10-20 11:02:38

by Louis Rilling

[permalink] [raw]
Subject: Re: [PATCH 05/10] Introduce function to dump process

Hi,

On Sat, Oct 18, 2008 at 03:11:33AM +0400, Andrey Mirkin wrote:
> Functions to dump task struct, fpu state and registers are added.
> All IDs are saved from the POV of process (container) namespace.

Just a couple of little comments, in case this series should keep on living.

[...]

> diff --git a/checkpoint/cpt_process.c b/checkpoint/cpt_process.c
> new file mode 100644
> index 0000000..58f608d
> --- /dev/null
> +++ b/checkpoint/cpt_process.c
> @@ -0,0 +1,236 @@
> +/*
> + * Copyright (C) 2008 Parallels, Inc.
> + *
> + * Author: Andrey Mirkin <[email protected]>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License as
> + * published by the Free Software Foundation, version 2 of the
> + * License.
> + *
> + */
> +
> +#include <linux/sched.h>
> +#include <linux/fs.h>
> +#include <linux/file.h>
> +#include <linux/version.h>
> +#include <linux/nsproxy.h>
> +
> +#include "checkpoint.h"
> +#include "cpt_image.h"
> +
> +static unsigned int encode_task_flags(unsigned int task_flags)
> +{
> + unsigned int flags = 0;
> +
> + if (task_flags & PF_EXITING)
> + flags |= (1 << CPT_PF_EXITING);
> + if (task_flags & PF_FORKNOEXEC)
> + flags |= (1 << CPT_PF_FORKNOEXEC);
> + if (task_flags & PF_SUPERPRIV)
> + flags |= (1 << CPT_PF_SUPERPRIV);
> + if (task_flags & PF_DUMPCORE)
> + flags |= (1 << CPT_PF_DUMPCORE);
> + if (task_flags & PF_SIGNALED)
> + flags |= (1 << CPT_PF_SIGNALED);
> + if (task_flags & PF_USED_MATH)
> + flags |= (1 << CPT_PF_USED_MATH);
> +
> + return flags;
> +
> +}
> +
> +int cpt_dump_task_struct(struct task_struct *tsk, struct cpt_context *ctx)
> +{
> + struct cpt_task_image *t;
> + int i;
> + int err;
> +
> + t = kzalloc(sizeof(*t), GFP_KERNEL);
> + if (!t)
> + return -ENOMEM;
> +
> + t->cpt_len = sizeof(*t);
> + t->cpt_type = CPT_OBJ_TASK;
> + t->cpt_hdrlen = sizeof(*t);
> + t->cpt_content = CPT_CONTENT_ARRAY;
> +
> + t->cpt_state = tsk->state;
> + t->cpt_flags = encode_task_flags(tsk->flags);
> + t->cpt_exit_code = tsk->exit_code;
> + t->cpt_exit_signal = tsk->exit_signal;
> + t->cpt_pdeath_signal = tsk->pdeath_signal;
> + t->cpt_pid = task_pid_nr_ns(tsk, ctx->nsproxy->pid_ns);
> + t->cpt_tgid = task_tgid_nr_ns(tsk, ctx->nsproxy->pid_ns);
> + t->cpt_ppid = tsk->parent ?
> + task_pid_nr_ns(tsk->parent, ctx->nsproxy->pid_ns) : 0;
> + t->cpt_rppid = tsk->real_parent ?
> + task_pid_nr_ns(tsk->real_parent, ctx->nsproxy->pid_ns) : 0;
> + t->cpt_pgrp = task_pgrp_nr_ns(tsk, ctx->nsproxy->pid_ns);
> + t->cpt_session = task_session_nr_ns(tsk, ctx->nsproxy->pid_ns);
> + t->cpt_old_pgrp = 0;
> + if (tsk->signal->tty_old_pgrp)
> + t->cpt_old_pgrp = pid_vnr(tsk->signal->tty_old_pgrp);
> + t->cpt_leader = tsk->group_leader ? task_pid_vnr(tsk->group_leader) : 0;

Why pid_vnr() here, and task_*_nr_ns() above? According to the introducing
comment, I'd expect something like pid_nr_ns(tsk->signal->tty_old_pgrp,
tsk->nsproxy->pid_ns), and the same for tsk->group_leader.

IIUC, pid_vnr() is correct only if ctx->nsproxy->pid_ns == tsk->nsproxy->pid_ns
== current->nsproxy->pid_ns, and I expect current to live in a different pid_ns.

Comments?
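
For illustration, the consistent variant would look roughly like this
(just my sketch; whether ctx->nsproxy->pid_ns or tsk->nsproxy->pid_ns is
the right namespace here is exactly the open question):

	t->cpt_old_pgrp = 0;
	if (tsk->signal->tty_old_pgrp)
		t->cpt_old_pgrp = pid_nr_ns(tsk->signal->tty_old_pgrp,
					    ctx->nsproxy->pid_ns);
	t->cpt_leader = tsk->group_leader ?
		task_pid_nr_ns(tsk->group_leader, ctx->nsproxy->pid_ns) : 0;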

> + t->cpt_utime = tsk->utime;
> + t->cpt_stime = tsk->stime;
> + t->cpt_utimescaled = tsk->utimescaled;
> + t->cpt_stimescaled = tsk->stimescaled;
> + t->cpt_gtime = tsk->gtime;
> + t->cpt_prev_utime = tsk->prev_utime;
> + t->cpt_prev_stime = tsk->prev_stime;
> + t->cpt_nvcsw = tsk->nvcsw;
> + t->cpt_nivcsw = tsk->nivcsw;
> + t->cpt_start_time = cpt_timespec_export(&tsk->start_time);
> + t->cpt_real_start_time = cpt_timespec_export(&tsk->real_start_time);
> + t->cpt_min_flt = tsk->min_flt;
> + t->cpt_maj_flt = tsk->maj_flt;
> + memcpy(t->cpt_comm, tsk->comm, TASK_COMM_LEN);
> + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) {
> + t->cpt_tls[i] = (((u64)tsk->thread.tls_array[i].b) << 32) +
> + tsk->thread.tls_array[i].a;
> + }
> + /* TODO: encode thread flags and status like task flags */
> + t->cpt_thrflags = task_thread_info(tsk)->flags & ~(1<<TIF_FREEZE);
> + t->cpt_thrstatus = task_thread_info(tsk)->status;
> + t->cpt_user = tsk->user->uid;
> + t->cpt_uid = tsk->uid;
> + t->cpt_euid = tsk->euid;
> + t->cpt_suid = tsk->suid;
> + t->cpt_fsuid = tsk->fsuid;
> + t->cpt_gid = tsk->gid;
> + t->cpt_egid = tsk->egid;
> + t->cpt_sgid = tsk->sgid;
> + t->cpt_fsgid = tsk->fsgid;
> +
> + err = ctx->write(t, sizeof(*t), ctx);
> +
> + kfree(t);
> + return err;
> +}
> +
> +static int cpt_dump_fpustate(struct task_struct *tsk, struct cpt_context *ctx)
> +{
> + struct cpt_obj_bits hdr;
> + int err;
> + int content;
> + unsigned long size;
> +
> + content = CPT_CONTENT_X86_FPUSTATE;
> + size = sizeof(struct i387_fxsave_struct);
> +#ifndef CONFIG_X86_64
> + if (!cpu_has_fxsr) {
> + size = sizeof(struct i387_fsave_struct);
> + content = CPT_CONTENT_X86_FPUSTATE_OLD;
> + }
> +#endif
> +
> + hdr.cpt_len = sizeof(hdr) + size;
> + hdr.cpt_type = CPT_OBJ_BITS;
> + hdr.cpt_hdrlen = sizeof(hdr);
> + hdr.cpt_content = content;
> + hdr.cpt_size = size;
> + err = ctx->write(&hdr, sizeof(hdr), ctx);
> + if (!err)
> + ctx->write(tsk->thread.xstate, size, ctx);

Should check the error code of the line above, right?
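
Something like this, say:

	err = ctx->write(&hdr, sizeof(hdr), ctx);
	if (!err)
		err = ctx->write(tsk->thread.xstate, size, ctx);
	return err;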

> + return err;
> +}
> +
> +static u32 encode_segment(u32 segreg)
> +{
> + segreg &= 0xFFFF;
> +
> + if (segreg == 0)
> + return CPT_SEG_ZERO;
> + if ((segreg & 3) != 3) {
> + eprintk("Invalid RPL of a segment reg %x\n", segreg);
> + return CPT_SEG_ZERO;
> + }
> +
> + /* LDT descriptor, it is just an index to LDT array */
> + if (segreg & 4)
> + return CPT_SEG_LDT + (segreg >> 3);
> +
> + /* TLS descriptor. */
> + if ((segreg >> 3) >= GDT_ENTRY_TLS_MIN &&
> + (segreg >> 3) <= GDT_ENTRY_TLS_MAX)
> + return CPT_SEG_TLS1 + ((segreg>>3) - GDT_ENTRY_TLS_MIN);
> +
> + /* One of standard desriptors */
> +#ifdef CONFIG_X86_64
> + if (segreg == __USER32_DS)
> + return CPT_SEG_USER32_DS;
> + if (segreg == __USER32_CS)
> + return CPT_SEG_USER32_CS;
> + if (segreg == __USER_DS)
> + return CPT_SEG_USER64_DS;
> + if (segreg == __USER_CS)
> + return CPT_SEG_USER64_CS;
> +#else
> + if (segreg == __USER_DS)
> + return CPT_SEG_USER32_DS;
> + if (segreg == __USER_CS)
> + return CPT_SEG_USER32_CS;
> +#endif
> + eprintk("Invalid segment reg %x\n", segreg);
> + return CPT_SEG_ZERO;
> +}
> +
> +static int cpt_dump_registers(struct task_struct *tsk, struct cpt_context *ctx)
> +{
> + struct cpt_x86_regs ri;
> + struct pt_regs *pt_regs;
> +
> + ri.cpt_len = sizeof(ri);
> + ri.cpt_type = CPT_OBJ_X86_REGS;
> + ri.cpt_hdrlen = sizeof(ri);
> + ri.cpt_content = CPT_CONTENT_VOID;
> +
> + ri.cpt_debugreg[0] = tsk->thread.debugreg0;
> + ri.cpt_debugreg[1] = tsk->thread.debugreg1;
> + ri.cpt_debugreg[2] = tsk->thread.debugreg2;
> + ri.cpt_debugreg[3] = tsk->thread.debugreg3;
> + ri.cpt_debugreg[4] = 0;
> + ri.cpt_debugreg[5] = 0;
> + ri.cpt_debugreg[6] = tsk->thread.debugreg6;
> + ri.cpt_debugreg[7] = tsk->thread.debugreg7;
> +
> + pt_regs = task_pt_regs(tsk);
> +
> + ri.cpt_fs = encode_segment(pt_regs->fs);
> + ri.cpt_gs = encode_segment(tsk->thread.gs);
> +
> + ri.cpt_bx = pt_regs->bx;
> + ri.cpt_cx = pt_regs->cx;
> + ri.cpt_dx = pt_regs->dx;
> + ri.cpt_si = pt_regs->si;
> + ri.cpt_di = pt_regs->di;
> + ri.cpt_bp = pt_regs->bp;
> + ri.cpt_ax = pt_regs->ax;
> + ri.cpt_ds = encode_segment(pt_regs->ds);
> + ri.cpt_es = encode_segment(pt_regs->es);
> + ri.cpt_orig_ax = pt_regs->orig_ax;
> + ri.cpt_ip = pt_regs->ip;
> + ri.cpt_cs = encode_segment(pt_regs->cs);
> + ri.cpt_flags = pt_regs->flags;
> + ri.cpt_sp = pt_regs->sp;
> + ri.cpt_ss = encode_segment(pt_regs->ss);
> +
> + return ctx->write(&ri, sizeof(ri), ctx);
> +}
> +
> +int cpt_dump_task(struct task_struct *tsk, struct cpt_context *ctx)
> +{
> + int err;
> +
> + err = cpt_dump_task_struct(tsk, ctx);
> +
> + /* Dump task mm */
> +
> + if (!err)
> + cpt_dump_fpustate(tsk, ctx);

error checking...

> + if (!err)
> + cpt_dump_registers(tsk, ctx);

error checking...

> +
> + return err;
> +}
> --
> 1.5.6
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

Louis

--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes



2008-10-20 12:25:27

by Louis Rilling

[permalink] [raw]
Subject: Re: [PATCH 06/10] Introduce functions to dump mm

On Sat, Oct 18, 2008 at 03:11:34AM +0400, Andrey Mirkin wrote:
> Functions to dump mm struct, VMAs and mm context are added.

Again, a few little comments.

[...]

> diff --git a/checkpoint/cpt_mm.c b/checkpoint/cpt_mm.c
> new file mode 100644
> index 0000000..8a22c48
> --- /dev/null
> +++ b/checkpoint/cpt_mm.c
> @@ -0,0 +1,434 @@
> +/*
> + * Copyright (C) 2008 Parallels, Inc.
> + *
> + * Authors: Andrey Mirkin <[email protected]>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License as
> + * published by the Free Software Foundation, version 2 of the
> + * License.
> + *
> + */
> +
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/file.h>
> +#include <linux/mm.h>
> +#include <linux/errno.h>
> +#include <linux/major.h>
> +#include <linux/mman.h>
> +#include <linux/mnt_namespace.h>
> +#include <linux/mount.h>
> +#include <linux/namei.h>
> +#include <linux/pagemap.h>
> +#include <linux/hugetlb.h>
> +#include <asm/ldt.h>
> +
> +#include "checkpoint.h"
> +#include "cpt_image.h"
> +
> +struct page_area
> +{
> + int type;
> + unsigned long start;
> + unsigned long end;
> + pgoff_t pgoff;
> + loff_t mm;
> + __u64 list[16];
> +};
> +
> +struct page_desc
> +{
> + int type;
> + pgoff_t index;
> + loff_t mm;
> + int shared;
> +};
> +
> +enum {
> + PD_ABSENT,
> + PD_COPY,
> + PD_FUNKEY,
> +};
> +
> +/* 0: page can be obtained from backstore, or still not mapped anonymous page,
> + or something else, which does not requre copy.
> + 1: page requires copy
> + 2: page requres copy but its content is zero. Quite useless.
> + 3: wp page is shared after fork(). It is to be COWed when modified.
> + 4: page is something unsupported... We copy it right now.
> + */
> +
> +static void page_get_desc(struct vm_area_struct *vma, unsigned long addr,
> + struct page_desc *pdesc, cpt_context_t * ctx)
> +{
> + struct mm_struct *mm = vma->vm_mm;
> + pgd_t *pgd;
> + pud_t *pud;
> + pmd_t *pmd;
> + pte_t *ptep, pte;
> + spinlock_t *ptl;
> + struct page *pg = NULL;
> + pgoff_t linear_index = (addr - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff;
> +
> + pdesc->index = linear_index;
> + pdesc->shared = 0;
> + pdesc->mm = CPT_NULL;
> +
> + if (vma->vm_flags & VM_IO) {
> + pdesc->type = PD_ABSENT;
> + return;
> + }
> +
> + pgd = pgd_offset(mm, addr);
> + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
> + goto out_absent;
> + pud = pud_offset(pgd, addr);
> + if (pud_none(*pud) || unlikely(pud_bad(*pud)))
> + goto out_absent;
> + pmd = pmd_offset(pud, addr);
> + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
> + goto out_absent;
> +#ifdef CONFIG_X86
> + if (pmd_huge(*pmd)) {
> + eprintk("page_huge\n");
> + goto out_unsupported;
> + }
> +#endif
> + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
> + pte = *ptep;
> + pte_unmap(ptep);
> +
> + if (pte_none(pte))
> + goto out_absent_unlock;
> +
> + if ((pg = vm_normal_page(vma, addr, pte)) == NULL) {
> + pdesc->type = PD_COPY;
> + goto out_unlock;
> + }
> +
> + get_page(pg);
> + spin_unlock(ptl);
> +
> + if (pg->mapping && !PageAnon(pg)) {
> + if (vma->vm_file == NULL) {
> + eprintk("pg->mapping!=NULL for fileless vma: %08lx\n", addr);
> + goto out_unsupported;
> + }
> + if (vma->vm_file->f_mapping != pg->mapping) {
> + eprintk("pg->mapping!=f_mapping: %08lx %p %p\n",
> + addr, vma->vm_file->f_mapping, pg->mapping);
> + goto out_unsupported;
> + }
> + pdesc->index = (pg->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
> + /* Page is in backstore. For us it is like
> + * it is not present.
> + */
> + goto out_absent;
> + }
> +
> + if (PageReserved(pg)) {
> + /* Special case: ZERO_PAGE is used, when an
> + * anonymous page is accessed but not written. */
> + if (pg == ZERO_PAGE(addr)) {
> + if (pte_write(pte)) {
> + eprintk("not funny already, writable ZERO_PAGE\n");
> + goto out_unsupported;
> + }
> + /* Just copy it for now */
> + pdesc->type = PD_COPY;
> + goto out_put;
> + }
> + eprintk("reserved page %lu at %08lx\n", pg->index, addr);
> + goto out_unsupported;
> + }
> +
> + if (!pg->mapping) {
> + eprintk("page without mapping at %08lx\n", addr);
> + goto out_unsupported;
> + }
> +
> + pdesc->type = PD_COPY;
> +
> +out_put:
> + if (pg)
> + put_page(pg);
> + return;
> +
> +out_unlock:
> + spin_unlock(ptl);
> + goto out_put;
> +
> +out_absent_unlock:
> + spin_unlock(ptl);
> +
> +out_absent:
> + pdesc->type = PD_ABSENT;
> + goto out_put;
> +
> +out_unsupported:
> + pdesc->type = PD_FUNKEY;
> + goto out_put;
> +}
> +
> +static int count_vma_pages(struct vm_area_struct *vma, struct cpt_context *ctx)
> +{
> + unsigned long addr;
> + int page_num = 0;
> +
> + for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
> + struct page_desc pd;
> +
> + page_get_desc(vma, addr, &pd, ctx);
> +
> + if (pd.type != PD_COPY) {
> + return -EINVAL;
> + } else {
> + page_num += 1;
> + }
> +
> + }
> + return page_num;
> +}
> +
> +/* ATTN: We give "current" to get_user_pages(). This is wrong, but get_user_pages()
> + * does not really need this thing. It just stores some page fault stats there.
> + *
> + * BUG: some archs (f.e. sparc64, but not Intel*) require flush cache pages
> + * before accessing vma.
> + */
> +static int dump_pages(struct vm_area_struct *vma, unsigned long start,
> + unsigned long end, struct cpt_context *ctx)
> +{
> +#define MAX_PAGE_BATCH 16
> + struct page *pg[MAX_PAGE_BATCH];
> + int npages = (end - start)/PAGE_SIZE;
> + int count = 0;
> +
> + while (count < npages) {
> + int copy = npages - count;
> + int n;
> +
> + if (copy > MAX_PAGE_BATCH)
> + copy = MAX_PAGE_BATCH;
> + n = get_user_pages(current, vma->vm_mm, start, copy,
> + 0, 1, pg, NULL);
> + if (n == copy) {
> + int i;
> + for (i=0; i<n; i++) {
> + char *maddr = kmap(pg[i]);
> + ctx->write(maddr, PAGE_SIZE, ctx);
> + kunmap(pg[i]);

There is no error handling in this inner loop. Should be fixed imho.
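
Something along these lines, for instance (only a sketch, assuming
ctx->write() returns 0 or a negative error like elsewhere):

	int i, err;

	for (i = 0; i < n; i++) {
		char *maddr = kmap(pg[i]);

		err = ctx->write(maddr, PAGE_SIZE, ctx);
		kunmap(pg[i]);
		if (err) {
			/* drop the references taken by get_user_pages() */
			for ( ; n > 0; n--)
				page_cache_release(pg[n-1]);
			return err;
		}
	}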

> + }
> + } else {
> + eprintk("get_user_pages fault");
> + for ( ; n > 0; n--)
> + page_cache_release(pg[n-1]);
> + return -EFAULT;
> + }
> + start += n*PAGE_SIZE;
> + count += n;
> + for ( ; n > 0; n--)
> + page_cache_release(pg[n-1]);
> + }
> + return 0;
> +}
> +
> +static int dump_page_block(struct vm_area_struct *vma,
> + struct cpt_page_block *pgb,
> + struct cpt_context *ctx)
> +{
> + int err;
> + pgb->cpt_len = sizeof(*pgb) + pgb->cpt_end - pgb->cpt_start;
> + pgb->cpt_type = CPT_OBJ_PAGES;
> + pgb->cpt_hdrlen = sizeof(*pgb);
> + pgb->cpt_content = CPT_CONTENT_DATA;
> +
> + err = ctx->write(pgb, sizeof(*pgb), ctx);
> + if (!err)
> + err = dump_pages(vma, pgb->cpt_start, pgb->cpt_end, ctx);
> +
> + return err;
> +}
> +
> +static int cpt_dump_dentry(struct path *p, cpt_context_t *ctx)
> +{
> + int len;
> + char *path;
> + char *buf;
> + struct cpt_object_hdr o;
> +
> + buf = (char *)__get_free_page(GFP_KERNEL);
> + if (!buf)
> + return -ENOMEM;
> +
> + path = d_path(p, buf, PAGE_SIZE);
> +
> + if (IS_ERR(path)) {
> + free_page((unsigned long)buf);
> + return PTR_ERR(path);
> + }
> +
> + len = buf + PAGE_SIZE - 1 - path;
> + o.cpt_len = sizeof(o) + len + 1;
> + o.cpt_type = CPT_OBJ_NAME;
> + o.cpt_hdrlen = sizeof(o);
> + o.cpt_content = CPT_CONTENT_NAME;
> + path[len] = 0;
> +
> + ctx->write(&o, sizeof(o), ctx);
> + ctx->write(path, len + 1, ctx);

Error handling?

> + free_page((unsigned long)buf);
> +
> + return 0;
> +}
> +
> +static int dump_one_vma(struct mm_struct *mm,
> + struct vm_area_struct *vma, struct cpt_context *ctx)
> +{
> + struct cpt_vma_image *v;
> + unsigned long addr;
> + int page_num;
> + int err;
> +
> + v = kzalloc(sizeof(*v), GFP_KERNEL);
> + if (!v)
> + return -ENOMEM;
> +
> + v->cpt_len = sizeof(*v);
> + v->cpt_type = CPT_OBJ_VMA;
> + v->cpt_hdrlen = sizeof(*v);
> + v->cpt_content = CPT_CONTENT_ARRAY;
> +
> + v->cpt_start = vma->vm_start;
> + v->cpt_end = vma->vm_end;
> + v->cpt_flags = vma->vm_flags;
> + if (vma->vm_flags & VM_HUGETLB) {
> + eprintk("huge TLB VMAs are still not supported\n");
> + kfree(v);
> + return -EINVAL;
> + }
> + v->cpt_pgprot = vma->vm_page_prot.pgprot;
> + v->cpt_pgoff = vma->vm_pgoff;
> + v->cpt_file = CPT_NULL;
> + v->cpt_vma_type = CPT_VMA_TYPE_0;
> +
> + page_num = count_vma_pages(vma, ctx);
> + if (page_num < 0) {
> + kfree(v);
> + return -EINVAL;
> + }

AFAICS, since count_vma_pages() only supports pages with PD_COPY, and since
page_get_desc() tags text segment pages (file-mapped and not anonymous since
not written to) as PD_ABSENT, no executable is checkpointable. So, where is
the trick? Am I completely missing something about page mapping?

> + v->cpt_page_num = page_num;
> +
> + if (vma->vm_file) {
> + v->cpt_file = 0;
> + v->cpt_vma_type = CPT_VMA_FILE;
> + }
> +
> + ctx->write(v, sizeof(*v), ctx);

Error handling?

> + kfree(v);
> +
> + if (vma->vm_file) {
> + err = cpt_dump_dentry(&vma->vm_file->f_path, ctx);
> + if (err < 0)
> + return err;
> + }
> +
> + for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
> + struct page_desc pd;
> + struct cpt_page_block pgb;
> +
> + page_get_desc(vma, addr, &pd, ctx);
> +
> + if (pd.type == PD_FUNKEY || pd.type == PD_ABSENT) {
> + eprintk("dump_one_vma: funkey page\n");
> + return -EINVAL;
> + }
> +
> + pgb.cpt_start = addr;
> + pgb.cpt_end = addr + PAGE_SIZE;
> + dump_page_block(vma, &pgb, ctx);

Error handling?

> + }
> +
> + return 0;
> +}
> +
> +static int cpt_dump_mm_context(struct mm_struct *mm, struct cpt_context *ctx)
> +{
> +#ifdef CONFIG_X86
> + if (mm->context.size) {
> + struct cpt_obj_bits b;
> + int size;
> +
> + mutex_lock(&mm->context.lock);
> +
> + b.cpt_type = CPT_OBJ_BITS;
> + b.cpt_len = sizeof(b);
> + b.cpt_content = CPT_CONTENT_MM_CONTEXT;
> + b.cpt_size = mm->context.size * LDT_ENTRY_SIZE;
> +
> + ctx->write(&b, sizeof(b), ctx);
> +
> + size = mm->context.size * LDT_ENTRY_SIZE;
> +
> + ctx->write(mm->context.ldt, size, ctx);

Error handling?

> +
> + mutex_unlock(&mm->context.lock);
> + }
> +#endif
> + return 0;
> +}
> +
> +int cpt_dump_mm(struct task_struct *tsk, struct cpt_context *ctx)
> +{
> + struct mm_struct *mm = tsk->mm;
> + struct cpt_mm_image *v;
> + struct vm_area_struct *vma;
> + int err;
> +
> + v = kzalloc(sizeof(*v), GFP_KERNEL);
> + if (!v)
> + return -ENOMEM;
> +
> + v->cpt_len = sizeof(*v);
> + v->cpt_type = CPT_OBJ_MM;
> + v->cpt_hdrlen = sizeof(*v);
> + v->cpt_content = CPT_CONTENT_ARRAY;
> +
> + down_read(&mm->mmap_sem);
> + v->cpt_start_code = mm->start_code;
> + v->cpt_end_code = mm->end_code;
> + v->cpt_start_data = mm->start_data;
> + v->cpt_end_data = mm->end_data;
> + v->cpt_start_brk = mm->start_brk;
> + v->cpt_brk = mm->brk;
> + v->cpt_start_stack = mm->start_stack;
> + v->cpt_start_arg = mm->arg_start;
> + v->cpt_end_arg = mm->arg_end;
> + v->cpt_start_env = mm->env_start;
> + v->cpt_end_env = mm->env_end;
> + v->cpt_def_flags = mm->def_flags;
> + v->cpt_flags = mm->flags;
> + v->cpt_map_count = mm->map_count;
> +
> + err = ctx->write(v, sizeof(*v), ctx);
> + kfree(v);
> +
> + if (err) {
> + eprintk("error during writing mm\n");
> + goto err_up;
> + }
> +
> + for (vma = mm->mmap; vma; vma = vma->vm_next) {
> + if ((err = dump_one_vma(mm, vma, ctx)) != 0)
> + goto err_up;
> + }
> +
> + err = cpt_dump_mm_context(mm, ctx);
> +
> +err_up:
> + up_read(&mm->mmap_sem);
> +
> + return err;
> +}
> +

[...]

Louis

--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes



2008-10-20 13:25:47

by Louis Rilling

[permalink] [raw]
Subject: Re: [PATCH 08/10] Introduce functions to restart a process

On Sat, Oct 18, 2008 at 03:11:36AM +0400, Andrey Mirkin wrote:
> Functions to restart process, restore its state, fpu and registers are added.

[...]

> diff --git a/checkpoint/rst_process.c b/checkpoint/rst_process.c
> new file mode 100644
> index 0000000..b9f745e
> --- /dev/null
> +++ b/checkpoint/rst_process.c
> @@ -0,0 +1,277 @@
> +/*
> + * Copyright (C) 2008 Parallels, Inc.
> + *
> + * Author: Andrey Mirkin <[email protected]>
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License as
> + * published by the Free Software Foundation, version 2 of the
> + * License.
> + *
> + */
> +
> +#include <linux/sched.h>
> +#include <linux/fs.h>
> +#include <linux/file.h>
> +#include <linux/version.h>
> +#include <linux/module.h>
> +
> +#include "checkpoint.h"
> +#include "cpt_image.h"
> +
> +#define HOOK_RESERVE 256
> +
> +struct thr_context {
> + struct completion complete;
> + int error;
> + struct cpt_context *ctx;
> + struct task_struct *tsk;
> +};
> +
> +int local_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid)
> +{
> + pid_t ret;
> +
> + if (current->fs == NULL) {
> + /* do_fork_pid() hates processes without fs, oopses. */
> + eprintk("local_kernel_thread: current->fs==NULL\n");
> + return -EINVAL;
> + }
> + if (!try_module_get(THIS_MODULE))
> + return -EBUSY;
> + ret = kernel_thread(fn, arg, flags);
> + if (ret < 0)
> + module_put(THIS_MODULE);
> + return ret;
> +}
> +
> +static unsigned int decode_task_flags(unsigned int task_flags)
> +{
> + unsigned int flags = 0;
> +
> + if (task_flags & (1 << CPT_PF_EXITING))
> + flags |= PF_EXITING;
> + if (task_flags & (1 << CPT_PF_FORKNOEXEC))
> + flags |= PF_FORKNOEXEC;
> + if (task_flags & (1 << CPT_PF_SUPERPRIV))
> + flags |= PF_SUPERPRIV;
> + if (task_flags & (1 << CPT_PF_DUMPCORE))
> + flags |= PF_DUMPCORE;
> + if (task_flags & (1 << CPT_PF_SIGNALED))
> + flags |= PF_SIGNALED;
> +
> + return flags;
> +
> +}
> +
> +int rst_restore_task_struct(struct task_struct *tsk, struct cpt_task_image *ti,
> + struct cpt_context *ctx)
> +{
> + int i;
> +
> + /* Restore only saved flags, comm and tls for now */
> + tsk->flags = decode_task_flags(ti->cpt_flags);
> + clear_tsk_thread_flag(tsk, TIF_FREEZE);
> + memcpy(tsk->comm, ti->cpt_comm, TASK_COMM_LEN);
> + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) {
> + tsk->thread.tls_array[i].a = ti->cpt_tls[i] & 0xFFFFFFFF;
> + tsk->thread.tls_array[i].b = ti->cpt_tls[i] >> 32;
> + }
> +
> + return 0;
> +}
> +
> +static int rst_restore_fpustate(struct task_struct *tsk, struct cpt_task_image *ti,
> + struct cpt_context *ctx)
> +{
> + struct cpt_obj_bits hdr;
> + int err;
> + char *buf;
> +
> + clear_stopped_child_used_math(tsk);
> +
> + err = rst_get_object(CPT_OBJ_BITS, &hdr, sizeof(hdr), ctx);
> + if (err < 0)
> + return err;
> +
> + buf = kmalloc(hdr.cpt_size, GFP_KERNEL);
> + if (!buf)
> + return -ENOMEM;
> +
> + err = ctx->read(buf, hdr.cpt_size, ctx);
> + if (err)
> + goto out;
> +
> + if (hdr.cpt_content == CPT_CONTENT_X86_FPUSTATE && cpu_has_fxsr) {
> + memcpy(&tsk->thread.xstate, buf,
> + sizeof(struct i387_fxsave_struct));
> + if (ti->cpt_flags & CPT_PF_USED_MATH)
> + set_stopped_child_used_math(tsk);
> + }
> +#ifndef CONFIG_X86_64
> + else if (hdr.cpt_content == CPT_CONTENT_X86_FPUSTATE_OLD &&
> + !cpu_has_fxsr) {
> + memcpy(&tsk->thread.xstate, buf,
> + sizeof(struct i387_fsave_struct));
> + if (ti->cpt_flags & CPT_PF_USED_MATH)
> + set_stopped_child_used_math(tsk);
> + }
> +#endif
> +
> +out:
> + kfree(buf);
> + return err;
> +}
> +
> +static u32 decode_segment(u32 segid)
> +{
> + if (segid == CPT_SEG_ZERO)
> + return 0;
> +
> + /* TLS descriptors */
> + if (segid <= CPT_SEG_TLS3)
> + return ((GDT_ENTRY_TLS_MIN + segid - CPT_SEG_TLS1) << 3) + 3;
> +
> + /* LDT descriptor, it is just an index to LDT array */
> + if (segid >= CPT_SEG_LDT)
> + return ((segid - CPT_SEG_LDT) << 3) | 7;
> +
> + /* Check for one of standard descriptors */
> + if (segid == CPT_SEG_USER32_DS)
> + return __USER_DS;
> + if (segid == CPT_SEG_USER32_CS)
> + return __USER_CS;
> +
> + eprintk("Invalid segment reg %d\n", segid);
> + return 0;
> +}
> +
> +static int rst_restore_registers(struct task_struct *tsk, struct cpt_context *ctx)
> +{
> + struct cpt_x86_regs ri;
> + struct pt_regs *regs = task_pt_regs(tsk);
> + extern char i386_ret_from_resume;
> + int err;
> +
> + err = rst_get_object(CPT_OBJ_X86_REGS, &ri, sizeof(ri), ctx);
> + if (err < 0)
> + return err;
> +
> + tsk->thread.sp = (unsigned long) regs;
> + tsk->thread.sp0 = (unsigned long) (regs+1);
> + tsk->thread.ip = (unsigned long) &i386_ret_from_resume;
> +
> + tsk->thread.gs = decode_segment(ri.cpt_gs);
> + tsk->thread.debugreg0 = ri.cpt_debugreg[0];
> + tsk->thread.debugreg1 = ri.cpt_debugreg[1];
> + tsk->thread.debugreg2 = ri.cpt_debugreg[2];
> + tsk->thread.debugreg3 = ri.cpt_debugreg[3];
> + tsk->thread.debugreg6 = ri.cpt_debugreg[6];
> + tsk->thread.debugreg7 = ri.cpt_debugreg[7];
> +
> + regs->bx = ri.cpt_bx;
> + regs->cx = ri.cpt_cx;
> + regs->dx = ri.cpt_dx;
> + regs->si = ri.cpt_si;
> + regs->di = ri.cpt_di;
> + regs->bp = ri.cpt_bp;
> + regs->ax = ri.cpt_ax;
> + regs->orig_ax = ri.cpt_orig_ax;
> + regs->ip = ri.cpt_ip;
> + regs->flags = ri.cpt_flags;
> + regs->sp = ri.cpt_sp;
> +
> + regs->cs = decode_segment(ri.cpt_cs);
> + regs->ss = decode_segment(ri.cpt_ss);
> + regs->ds = decode_segment(ri.cpt_ds);
> + regs->es = decode_segment(ri.cpt_es);
> + regs->fs = decode_segment(ri.cpt_fs);
> +
> + tsk->thread.sp -= HOOK_RESERVE;
> + memset((void*)tsk->thread.sp, 0, HOOK_RESERVE);
> +
> + return 0;
> +}
> +
> +static int restart_thread(void *arg)
> +{
> + struct thr_context *thr_ctx = arg;
> + struct cpt_context *ctx;
> + struct cpt_task_image *ti;
> + int err;
> +
> + current->state = TASK_UNINTERRUPTIBLE;
> +
> + ctx = thr_ctx->ctx;
> + ti = kmalloc(sizeof(*ti), GFP_KERNEL);
> + if (!ti)
> + return -ENOMEM;
> +
> + err = rst_get_object(CPT_OBJ_TASK, ti, sizeof(*ti), ctx);
> + if (!err)
> + err = rst_restore_task_struct(current, ti, ctx);
> + /* Restore mm here */
> + if (!err)
> + err = rst_restore_fpustate(current, ti, ctx);
> + if (!err)
> + err = rst_restore_registers(current, ctx);
> +
> + thr_ctx->error = err;
> + complete(&thr_ctx->complete);
> +
> + if (!err && (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) {
> + do_exit(ti->cpt_exit_code);
> + } else {
> + __set_current_state(TASK_UNINTERRUPTIBLE);
> + }
> +
> + kfree(ti);
> + schedule();
> +
> + eprintk("leaked %d/%d %p\n", task_pid_nr(current), task_pid_vnr(current), current->mm);
> +
> + module_put(THIS_MODULE);

I'm sorry, I still do not understand what you are doing with this self-module
pinning stuff. AFAICS, we should not get here unless there is a bug. So the
checkpoint module ref count is never decreased, right?

Could you detail what is this self-module pinning for? As I already told you,
this looks like a bogus solution to avoid unloading the checkpoint module during
restart.

Thanks!

Louis

[...]

--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes



2008-10-20 16:52:19

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCH 02/10] Make checkpoint/restart functionality modular

On Sat, 2008-10-18 at 03:11 +0400, Andrey Mirkin wrote:
> +struct cpt_operations
> +{
> + struct module * owner;
> + int (*checkpoint)(pid_t pid, int fd, unsigned long flags);
> + int (*restart)(int ctid, int fd, unsigned long flags);
> +};

I think this is pretty useless obfuscation. We're not going to have
pluggable checkpoint/restart implementations, are we? So, why bother
putting it in a module?

I can understand that it's easier to develop your code when it's in a
module and you don't have to reboot the machine to load a new kernel
each time. But, that's an individual developer thing, and doesn't
belong in an upstream submission.

I know people have given you a hard time for this in the past. Why is
it still here?

-- Dave

2008-10-20 17:00:42

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH 02/10] Make checkpoint/restart functionality modular

Quoting Andrey Mirkin ([email protected]):
> A config option CONFIG_CHECKPOINT is introduced.
> New structure cpt_operations is introduced to store pointers to
> checkpoint/restart functions from module.

I thought we had decided not to use a kernel module?

Louis' comments on your patch 8 regarding module pinning suggests that
details about using a module will detract from proper review of the core
c/r functionality...

-serge

2008-10-20 17:02:51

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCH 03/10] Introduce context structure needed during checkpointing/restart

On Sat, 2008-10-18 at 03:11 +0400, Andrey Mirkin wrote:
> +typedef struct cpt_context
> +{
> + pid_t pid; /* should be changed to ctid later */
> + int ctx_id; /* context id */
> + struct list_head ctx_list;
> + int refcount;
> + int ctx_state;
> + struct semaphore main_sem;

Does this really need to be a semaphore or is a mutex OK?

> + int errno;

Could you hold off on adding these things to the struct until the patch
where they're actually used? It's hard to judge this without seeing
what you do with it.

> + struct file *file;
> + loff_t current_object;
> +
> + struct list_head object_array[CPT_OBJ_MAX];
> +
> + int (*write)(const void *addr, size_t count, struct cpt_context *ctx);
> + int (*read)(void *addr, size_t count, struct cpt_context *ctx);
> +} cpt_context_t;

Man, this is hard to review. I was going to try and make sure that your
refcounting was right and atomic, but there's no use of it in this patch
except for the initialization and accessor functions. Darn.

> +extern int debug_level;

I'm going to go out on a limb here and say that "debug_level" is
probably a wee bit too generic of a variable name.

> +#define cpt_printk(lvl, fmt, args...) do { \
> + if (lvl <= debug_level) \
> + printk(fmt, ##args); \
> + } while (0)

I think you can use pr_debug() here, too, just like Oren did.

> +struct cpt_context * context_alloc(void)
> +{
> + struct cpt_context *ctx;
> + int i;
> +
> + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
> + if (!ctx)
> + return NULL;
> +
> + init_MUTEX(&ctx->main_sem);
> + ctx->refcount = 1;
> +
> + ctx->current_object = -1;
> + ctx->write = file_write;
> + ctx->read = file_read;
> + for (i = 0; i < CPT_OBJ_MAX; i++) {
> + INIT_LIST_HEAD(&ctx->object_array[i]);
> + }
> +
> + return ctx;
> +}
> +
> +void context_release(struct cpt_context *ctx)
> +{
> + ctx->ctx_state = CPT_CTX_ERROR;
> +
> + kfree(ctx);
> +}
> +
> +static void context_put(struct cpt_context *ctx)
> +{
> + if (!--ctx->refcount)
> + context_release(ctx);
> +}
> +
> static int checkpoint(pid_t pid, int fd, unsigned long flags)
> {
> - return -ENOSYS;
> + struct file *file;
> + struct cpt_context *ctx;
> + int err;
> +
> + err = -EBADF;
> + file = fget(fd);
> + if (!file)
> + goto out;
> +
> + err = -ENOMEM;
> + ctx = context_alloc();
> + if (!ctx)
> + goto out_file;
> +
> + ctx->file = file;
> + ctx->ctx_state = CPT_CTX_DUMPING;
> +
> + /* checkpoint */
> + err = -ENOSYS;
> +
> + context_put(ctx);
> +
> +out_file:
> + fput(file);
> +out:
> + return err;
> }

So, where is context_get()? Is there only single-threaded access to the
refcount? If so, why do we even need it? We should probably just use
context_release() directly.

If there is multithreaded access to context_put() or the refcount, then
they're unsafe without additional locking.

-- Dave

2008-10-20 17:24:33

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCH 06/10] Introduce functions to dump mm

On Sat, 2008-10-18 at 03:11 +0400, Andrey Mirkin wrote:
> +static void page_get_desc(struct vm_area_struct *vma, unsigned long addr,
> + struct page_desc *pdesc, cpt_context_t * ctx)
> +{
> + struct mm_struct *mm = vma->vm_mm;
> + pgd_t *pgd;
> + pud_t *pud;
> + pmd_t *pmd;
> + pte_t *ptep, pte;
> + spinlock_t *ptl;
> + struct page *pg = NULL;
> + pgoff_t linear_index = (addr - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff;
> +
> + pdesc->index = linear_index;
> + pdesc->shared = 0;
> + pdesc->mm = CPT_NULL;
> +
> + if (vma->vm_flags & VM_IO) {
> + pdesc->type = PD_ABSENT;
> + return;
> + }
> +
> + pgd = pgd_offset(mm, addr);
> + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
> + goto out_absent;
> + pud = pud_offset(pgd, addr);
> + if (pud_none(*pud) || unlikely(pud_bad(*pud)))
> + goto out_absent;
> + pmd = pmd_offset(pud, addr);
> + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
> + goto out_absent;
> +#ifdef CONFIG_X86
> + if (pmd_huge(*pmd)) {
> + eprintk("page_huge\n");
> + goto out_unsupported;
> + }
> +#endif

I take it you know that this breaks with the 1GB (x86_64) and 16GB (ppc)
large pages.

Since you have the VMA, why not use is_vm_hugetlb_page()?
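
E.g. (my sketch), next to the existing VM_IO check at the top of
page_get_desc(), which also covers the 1GB/16GB cases:

	if (is_vm_hugetlb_page(vma)) {
		eprintk("huge page VMA\n");
		goto out_unsupported;
	}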

-- Dave

2008-10-20 17:48:28

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH 05/10] Introduce function to dump process

Quoting Andrey Mirkin ([email protected]):
> + t->cpt_uid = tsk->uid;
> + t->cpt_euid = tsk->euid;
> + t->cpt_suid = tsk->suid;
> + t->cpt_fsuid = tsk->fsuid;
> + t->cpt_gid = tsk->gid;
> + t->cpt_egid = tsk->egid;
> + t->cpt_sgid = tsk->sgid;
> + t->cpt_fsgid = tsk->fsgid;

I don't see where any of these are restored. (Obviously, I wanted
to think about how you're verifying the restarter's authorization
to do so)
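
For the restore side I'd expect, very roughly, something like the sketch
below in rst_restore_task_struct() (purely illustrative; the interesting
part is which authorization check is the right one):

	/* sketch only: refuse to assume foreign ids without privilege */
	if (!capable(CAP_SETUID) || !capable(CAP_SETGID))
		return -EPERM;
	tsk->uid = ti->cpt_uid;
	tsk->euid = ti->cpt_euid;
	tsk->suid = ti->cpt_suid;
	tsk->fsuid = ti->cpt_fsuid;
	tsk->gid = ti->cpt_gid;
	tsk->egid = ti->cpt_egid;
	tsk->sgid = ti->cpt_sgid;
	tsk->fsgid = ti->cpt_fsgid;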

thanks,
-serge

2008-10-22 08:51:21

by Andrey Mirkin

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 08/10] Introduce functions to restart a process

On Monday 20 October 2008 13:23 Cedric Le Goater wrote:
> Hello Andrey !
>
> > diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
> > index 109792b..a4848a3 100644
> > --- a/arch/x86/kernel/entry_32.S
> > +++ b/arch/x86/kernel/entry_32.S
> > @@ -225,6 +225,7 @@ ENTRY(ret_from_fork)
> > GET_THREAD_INFO(%ebp)
> > popl %eax
> > CFI_ADJUST_CFA_OFFSET -4
> > +ret_from_fork_tail:
> > pushl $0x0202 # Reset kernel eflags
> > CFI_ADJUST_CFA_OFFSET 4
> > popfl
> > @@ -233,6 +234,26 @@ ENTRY(ret_from_fork)
> > CFI_ENDPROC
> > END(ret_from_fork)
> >
> > +ENTRY(i386_ret_from_resume)
> > + CFI_STARTPROC
> > + pushl %eax
> > + CFI_ADJUST_CFA_OFFSET 4
> > + call schedule_tail
> > + GET_THREAD_INFO(%ebp)
> > + popl %eax
> > + CFI_ADJUST_CFA_OFFSET -4
> > + movl (%esp), %eax
> > + testl %eax, %eax
> > + jz 1f
> > + pushl %esp
> > + call *%eax
> > + addl $4, %esp
> > +1:
> > + addl $256, %esp
> > + jmp ret_from_fork_tail
> > + CFI_ENDPROC
> > +END(i386_ret_from_resume)
>
> Could you explain why you need to do this
>
> call *%eax
>
> is it related to the freezer code ?

It is not related to the freezer code actually.
It is needed to restart syscalls. Right now I don't have code in my
patchset which restarts a syscall, but I plan to add it later.
In OpenVZ checkpointing we restart a syscall if the process was caught in a
syscall during checkpointing.
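
Just to illustrate what the reserved area is meant for (this is only my
reading of i386_ret_from_resume, no such hook exists in the patchset yet):
the word at the new thread.sp can hold a function pointer which
i386_ret_from_resume will call, with a pointer to the reserved area as
argument, before jumping to ret_from_fork_tail. E.g.:

/* hypothetical helper, not part of this patchset */
static void install_resume_hook(struct task_struct *tsk, void (*fn)(void *))
{
	/* rst_restore_registers() left HOOK_RESERVE zeroed bytes at
	 * tsk->thread.sp; i386_ret_from_resume does
	 *	movl (%esp), %eax
	 *	call *%eax
	 * when this first word is non-zero. */
	*(unsigned long *)tsk->thread.sp = (unsigned long)fn;
}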

Andrey

2008-10-22 08:58:47

by Andrey Mirkin

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 06/10] Introduce functions to dump mm

On Monday 20 October 2008 16:25 Louis Rilling wrote:
> On Sat, Oct 18, 2008 at 03:11:34AM +0400, Andrey Mirkin wrote:
> > Functions to dump mm struct, VMAs and mm context are added.
>
> Again, a few little comments.
>
> [...]
>
> > diff --git a/checkpoint/cpt_mm.c b/checkpoint/cpt_mm.c
> > new file mode 100644
> > index 0000000..8a22c48
> > --- /dev/null
> > +++ b/checkpoint/cpt_mm.c
> > @@ -0,0 +1,434 @@
> > +/*
> > + * Copyright (C) 2008 Parallels, Inc.
> > + *
> > + * Authors: Andrey Mirkin <[email protected]>
> > + *
> > + * This program is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU General Public License as
> > + * published by the Free Software Foundation, version 2 of the
> > + * License.
> > + *
> > + */
> > +
> > +#include <linux/sched.h>
> > +#include <linux/slab.h>
> > +#include <linux/file.h>
> > +#include <linux/mm.h>
> > +#include <linux/errno.h>
> > +#include <linux/major.h>
> > +#include <linux/mman.h>
> > +#include <linux/mnt_namespace.h>
> > +#include <linux/mount.h>
> > +#include <linux/namei.h>
> > +#include <linux/pagemap.h>
> > +#include <linux/hugetlb.h>
> > +#include <asm/ldt.h>
> > +
> > +#include "checkpoint.h"
> > +#include "cpt_image.h"
> > +
> > +struct page_area
> > +{
> > + int type;
> > + unsigned long start;
> > + unsigned long end;
> > + pgoff_t pgoff;
> > + loff_t mm;
> > + __u64 list[16];
> > +};
> > +
> > +struct page_desc
> > +{
> > + int type;
> > + pgoff_t index;
> > + loff_t mm;
> > + int shared;
> > +};
> > +
> > +enum {
> > + PD_ABSENT,
> > + PD_COPY,
> > + PD_FUNKEY,
> > +};
> > +
> > +/* 0: page can be obtained from backstore, or still not mapped anonymous page,
> > +    or something else, which does not require copy.
> > +   1: page requires copy
> > +   2: page requires copy but its content is zero. Quite useless.
> > +   3: wp page is shared after fork(). It is to be COWed when modified.
> > +   4: page is something unsupported... We copy it right now.
> > + */
> > +
> > +static void page_get_desc(struct vm_area_struct *vma, unsigned long addr,
> > +		struct page_desc *pdesc, cpt_context_t * ctx)
> > +{
> > + struct mm_struct *mm = vma->vm_mm;
> > + pgd_t *pgd;
> > + pud_t *pud;
> > + pmd_t *pmd;
> > + pte_t *ptep, pte;
> > + spinlock_t *ptl;
> > + struct page *pg = NULL;
> > + pgoff_t linear_index = (addr - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff;
> > +
> > + pdesc->index = linear_index;
> > + pdesc->shared = 0;
> > + pdesc->mm = CPT_NULL;
> > +
> > + if (vma->vm_flags & VM_IO) {
> > + pdesc->type = PD_ABSENT;
> > + return;
> > + }
> > +
> > + pgd = pgd_offset(mm, addr);
> > + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
> > + goto out_absent;
> > + pud = pud_offset(pgd, addr);
> > + if (pud_none(*pud) || unlikely(pud_bad(*pud)))
> > + goto out_absent;
> > + pmd = pmd_offset(pud, addr);
> > + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
> > + goto out_absent;
> > +#ifdef CONFIG_X86
> > + if (pmd_huge(*pmd)) {
> > + eprintk("page_huge\n");
> > + goto out_unsupported;
> > + }
> > +#endif
> > + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
> > + pte = *ptep;
> > + pte_unmap(ptep);
> > +
> > + if (pte_none(pte))
> > + goto out_absent_unlock;
> > +
> > + if ((pg = vm_normal_page(vma, addr, pte)) == NULL) {
> > + pdesc->type = PD_COPY;
> > + goto out_unlock;
> > + }
> > +
> > + get_page(pg);
> > + spin_unlock(ptl);
> > +
> > + if (pg->mapping && !PageAnon(pg)) {
> > + if (vma->vm_file == NULL) {
> > + eprintk("pg->mapping!=NULL for fileless vma: %08lx\n", addr);
> > + goto out_unsupported;
> > + }
> > + if (vma->vm_file->f_mapping != pg->mapping) {
> > + eprintk("pg->mapping!=f_mapping: %08lx %p %p\n",
> > + addr, vma->vm_file->f_mapping, pg->mapping);
> > + goto out_unsupported;
> > + }
> > + pdesc->index = (pg->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
> > + /* Page is in backstore. For us it is like
> > + * it is not present.
> > + */
> > + goto out_absent;
> > + }
> > +
> > + if (PageReserved(pg)) {
> > + /* Special case: ZERO_PAGE is used, when an
> > + * anonymous page is accessed but not written. */
> > + if (pg == ZERO_PAGE(addr)) {
> > + if (pte_write(pte)) {
> > + eprintk("not funny already, writable ZERO_PAGE\n");
> > + goto out_unsupported;
> > + }
> > + /* Just copy it for now */
> > + pdesc->type = PD_COPY;
> > + goto out_put;
> > + }
> > + eprintk("reserved page %lu at %08lx\n", pg->index, addr);
> > + goto out_unsupported;
> > + }
> > +
> > + if (!pg->mapping) {
> > + eprintk("page without mapping at %08lx\n", addr);
> > + goto out_unsupported;
> > + }
> > +
> > + pdesc->type = PD_COPY;
> > +
> > +out_put:
> > + if (pg)
> > + put_page(pg);
> > + return;
> > +
> > +out_unlock:
> > + spin_unlock(ptl);
> > + goto out_put;
> > +
> > +out_absent_unlock:
> > + spin_unlock(ptl);
> > +
> > +out_absent:
> > + pdesc->type = PD_ABSENT;
> > + goto out_put;
> > +
> > +out_unsupported:
> > + pdesc->type = PD_FUNKEY;
> > + goto out_put;
> > +}
> > +
> > +static int count_vma_pages(struct vm_area_struct *vma,
> > +		struct cpt_context *ctx)
> > +{
> > + unsigned long addr;
> > + int page_num = 0;
> > +
> > + for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
> > + struct page_desc pd;
> > +
> > + page_get_desc(vma, addr, &pd, ctx);
> > +
> > + if (pd.type != PD_COPY) {
> > + return -EINVAL;
> > + } else {
> > + page_num += 1;
> > + }
> > +
> > + }
> > + return page_num;
> > +}
> > +
> > +/* ATTN: We give "current" to get_user_pages(). This is wrong, but
> > + * get_user_pages() does not really need this thing. It just stores some
> > + * page fault stats there.
> > + *
> > + * BUG: some archs (f.e. sparc64, but not Intel*) require flush cache pages
> > + * before accessing vma.
> > + */
> > +static int dump_pages(struct vm_area_struct *vma, unsigned long start,
> > + unsigned long end, struct cpt_context *ctx)
> > +{
> > +#define MAX_PAGE_BATCH 16
> > + struct page *pg[MAX_PAGE_BATCH];
> > + int npages = (end - start)/PAGE_SIZE;
> > + int count = 0;
> > +
> > + while (count < npages) {
> > + int copy = npages - count;
> > + int n;
> > +
> > + if (copy > MAX_PAGE_BATCH)
> > + copy = MAX_PAGE_BATCH;
> > + n = get_user_pages(current, vma->vm_mm, start, copy,
> > + 0, 1, pg, NULL);
> > + if (n == copy) {
> > + int i;
> > + for (i=0; i<n; i++) {
> > + char *maddr = kmap(pg[i]);
> > + ctx->write(maddr, PAGE_SIZE, ctx);
> > + kunmap(pg[i]);
>
> There is no error handling in this inner loop. Should be fixed imho.

Yes, you're right. It is already fixed in the next version. I'll try to send it
out shortly.
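
Roughly, the inner loop will propagate the write error, e.g. (a sketch, not
the exact code from the next version):

			int i, err = 0;

			for (i = 0; i < n; i++) {
				char *maddr = kmap(pg[i]);

				err = ctx->write(maddr, PAGE_SIZE, ctx);
				kunmap(pg[i]);
				if (err)
					break;
			}
			if (err) {
				for ( ; n > 0; n--)
					page_cache_release(pg[n-1]);
				return err;
			}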

>
> > + }
> > + } else {
> > + eprintk("get_user_pages fault");
> > + for ( ; n > 0; n--)
> > + page_cache_release(pg[n-1]);
> > + return -EFAULT;
> > + }
> > + start += n*PAGE_SIZE;
> > + count += n;
> > + for ( ; n > 0; n--)
> > + page_cache_release(pg[n-1]);
> > + }
> > + return 0;
> > +}
> > +
> > +static int dump_page_block(struct vm_area_struct *vma,
> > + struct cpt_page_block *pgb,
> > + struct cpt_context *ctx)
> > +{
> > + int err;
> > + pgb->cpt_len = sizeof(*pgb) + pgb->cpt_end - pgb->cpt_start;
> > + pgb->cpt_type = CPT_OBJ_PAGES;
> > + pgb->cpt_hdrlen = sizeof(*pgb);
> > + pgb->cpt_content = CPT_CONTENT_DATA;
> > +
> > + err = ctx->write(pgb, sizeof(*pgb), ctx);
> > + if (!err)
> > + err = dump_pages(vma, pgb->cpt_start, pgb->cpt_end, ctx);
> > +
> > + return err;
> > +}
> > +
> > +static int cpt_dump_dentry(struct path *p, cpt_context_t *ctx)
> > +{
> > + int len;
> > + char *path;
> > + char *buf;
> > + struct cpt_object_hdr o;
> > +
> > + buf = (char *)__get_free_page(GFP_KERNEL);
> > + if (!buf)
> > + return -ENOMEM;
> > +
> > + path = d_path(p, buf, PAGE_SIZE);
> > +
> > + if (IS_ERR(path)) {
> > + free_page((unsigned long)buf);
> > + return PTR_ERR(path);
> > + }
> > +
> > + len = buf + PAGE_SIZE - 1 - path;
> > + o.cpt_len = sizeof(o) + len + 1;
> > + o.cpt_type = CPT_OBJ_NAME;
> > + o.cpt_hdrlen = sizeof(o);
> > + o.cpt_content = CPT_CONTENT_NAME;
> > + path[len] = 0;
> > +
> > + ctx->write(&o, sizeof(o), ctx);
> > + ctx->write(path, len + 1, ctx);
>
> Error handling?
Will fix it, thanks.
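
Something along these lines (a sketch; it assumes an err local is added to
cpt_dump_dentry()):

	err = ctx->write(&o, sizeof(o), ctx);
	if (!err)
		err = ctx->write(path, len + 1, ctx);
	free_page((unsigned long)buf);

	return err;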

>
> > + free_page((unsigned long)buf);
> > +
> > + return 0;
> > +}
> > +
> > +static int dump_one_vma(struct mm_struct *mm,
> > + struct vm_area_struct *vma, struct cpt_context *ctx)
> > +{
> > + struct cpt_vma_image *v;
> > + unsigned long addr;
> > + int page_num;
> > + int err;
> > +
> > + v = kzalloc(sizeof(*v), GFP_KERNEL);
> > + if (!v)
> > + return -ENOMEM;
> > +
> > + v->cpt_len = sizeof(*v);
> > + v->cpt_type = CPT_OBJ_VMA;
> > + v->cpt_hdrlen = sizeof(*v);
> > + v->cpt_content = CPT_CONTENT_ARRAY;
> > +
> > + v->cpt_start = vma->vm_start;
> > + v->cpt_end = vma->vm_end;
> > + v->cpt_flags = vma->vm_flags;
> > + if (vma->vm_flags & VM_HUGETLB) {
> > + eprintk("huge TLB VMAs are still not supported\n");
> > + kfree(v);
> > + return -EINVAL;
> > + }
> > + v->cpt_pgprot = vma->vm_page_prot.pgprot;
> > + v->cpt_pgoff = vma->vm_pgoff;
> > + v->cpt_file = CPT_NULL;
> > + v->cpt_vma_type = CPT_VMA_TYPE_0;
> > +
> > + page_num = count_vma_pages(vma, ctx);
> > + if (page_num < 0) {
> > + kfree(v);
> > + return -EINVAL;
> > + }
>
> AFAICS, since count_vma_pages only supports pages with PD_COPY, and since
> page_get_desc() tags text segment pages (file-mapped and not anonymous
> since not written to) as PD_ABSENT, no executable is checkpointable. So,
> where is the trick? Am I completely missing something about page mapping?
Oh, that's my fault, I sent the wrong version. I will send a new patchset with
the correct page mapping today.

>
> > + v->cpt_page_num = page_num;
> > +
> > + if (vma->vm_file) {
> > + v->cpt_file = 0;
> > + v->cpt_vma_type = CPT_VMA_FILE;
> > + }
> > +
> > + ctx->write(v, sizeof(*v), ctx);
>
> Error handling?
Yes, will add it.

>
> > + kfree(v);
> > +
> > + if (vma->vm_file) {
> > + err = cpt_dump_dentry(&vma->vm_file->f_path, ctx);
> > + if (err < 0)
> > + return err;
> > + }
> > +
> > + for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
> > + struct page_desc pd;
> > + struct cpt_page_block pgb;
> > +
> > + page_get_desc(vma, addr, &pd, ctx);
> > +
> > + if (pd.type == PD_FUNKEY || pd.type == PD_ABSENT) {
> > + eprintk("dump_one_vma: funkey page\n");
> > + return -EINVAL;
> > + }
> > +
> > + pgb.cpt_start = addr;
> > + pgb.cpt_end = addr + PAGE_SIZE;
> > + dump_page_block(vma, &pgb, ctx);
>
> Error handling?
Yeap, thanks.

>
> > + }
> > +
> > + return 0;
> > +}
> > +
> > +static int cpt_dump_mm_context(struct mm_struct *mm, struct cpt_context *ctx)
> > +{
> > +#ifdef CONFIG_X86
> > + if (mm->context.size) {
> > + struct cpt_obj_bits b;
> > + int size;
> > +
> > + mutex_lock(&mm->context.lock);
> > +
> > + b.cpt_type = CPT_OBJ_BITS;
> > + b.cpt_len = sizeof(b);
> > + b.cpt_content = CPT_CONTENT_MM_CONTEXT;
> > + b.cpt_size = mm->context.size * LDT_ENTRY_SIZE;
> > +
> > + ctx->write(&b, sizeof(b), ctx);
> > +
> > + size = mm->context.size * LDT_ENTRY_SIZE;
> > +
> > + ctx->write(mm->context.ldt, size, ctx);
>
> Error handling?
Thanks again!

>
> > +
> > + mutex_unlock(&mm->context.lock);
> > + }
> > +#endif
> > + return 0;
> > +}
> > +
> > +int cpt_dump_mm(struct task_struct *tsk, struct cpt_context *ctx)
> > +{
> > + struct mm_struct *mm = tsk->mm;
> > + struct cpt_mm_image *v;
> > + struct vm_area_struct *vma;
> > + int err;
> > +
> > + v = kzalloc(sizeof(*v), GFP_KERNEL);
> > + if (!v)
> > + return -ENOMEM;
> > +
> > + v->cpt_len = sizeof(*v);
> > + v->cpt_type = CPT_OBJ_MM;
> > + v->cpt_hdrlen = sizeof(*v);
> > + v->cpt_content = CPT_CONTENT_ARRAY;
> > +
> > + down_read(&mm->mmap_sem);
> > + v->cpt_start_code = mm->start_code;
> > + v->cpt_end_code = mm->end_code;
> > + v->cpt_start_data = mm->start_data;
> > + v->cpt_end_data = mm->end_data;
> > + v->cpt_start_brk = mm->start_brk;
> > + v->cpt_brk = mm->brk;
> > + v->cpt_start_stack = mm->start_stack;
> > + v->cpt_start_arg = mm->arg_start;
> > + v->cpt_end_arg = mm->arg_end;
> > + v->cpt_start_env = mm->env_start;
> > + v->cpt_end_env = mm->env_end;
> > + v->cpt_def_flags = mm->def_flags;
> > + v->cpt_flags = mm->flags;
> > + v->cpt_map_count = mm->map_count;
> > +
> > + err = ctx->write(v, sizeof(*v), ctx);
> > + kfree(v);
> > +
> > + if (err) {
> > + eprintk("error during writing mm\n");
> > + goto err_up;
> > + }
> > +
> > + for (vma = mm->mmap; vma; vma = vma->vm_next) {
> > + if ((err = dump_one_vma(mm, vma, ctx)) != 0)
> > + goto err_up;
> > + }
> > +
> > + err = cpt_dump_mm_context(mm, ctx);
> > +
> > +err_up:
> > + up_read(&mm->mmap_sem);
> > +
> > + return err;
> > +}
> > +
>
> [...]
>
> Louis

2008-10-22 09:25:20

by Louis Rilling

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 08/10] Introduce functions to restart a process

On Wed, Oct 22, 2008 at 12:49:54PM +0400, Andrey Mirkin wrote:
> On Monday 20 October 2008 13:23 Cedric Le Goater wrote:
> > Hello Andrey !
> >
> > > diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
> > > index 109792b..a4848a3 100644
> > > --- a/arch/x86/kernel/entry_32.S
> > > +++ b/arch/x86/kernel/entry_32.S
> > > @@ -225,6 +225,7 @@ ENTRY(ret_from_fork)
> > > GET_THREAD_INFO(%ebp)
> > > popl %eax
> > > CFI_ADJUST_CFA_OFFSET -4
> > > +ret_from_fork_tail:
> > > pushl $0x0202 # Reset kernel eflags
> > > CFI_ADJUST_CFA_OFFSET 4
> > > popfl
> > > @@ -233,6 +234,26 @@ ENTRY(ret_from_fork)
> > > CFI_ENDPROC
> > > END(ret_from_fork)
> > >
> > > +ENTRY(i386_ret_from_resume)
> > > + CFI_STARTPROC
> > > + pushl %eax
> > > + CFI_ADJUST_CFA_OFFSET 4
> > > + call schedule_tail
> > > + GET_THREAD_INFO(%ebp)
> > > + popl %eax
> > > + CFI_ADJUST_CFA_OFFSET -4
> > > + movl (%esp), %eax
> > > + testl %eax, %eax
> > > + jz 1f
> > > + pushl %esp
> > > + call *%eax
> > > + addl $4, %esp
> > > +1:
> > > + addl $256, %esp
> > > + jmp ret_from_fork_tail
> > > + CFI_ENDPROC
> > > +END(i386_ret_from_resume)
> >
> > Could you explain why you need to do this
> >
> > call *%eax
> >
> > is it related to the freezer code ?
>
> It is not related to the freezer code actually.
> That is needed to restart syscalls. Right now I don't have a code in my
> patchset which restarts a syscall, but later I plan to add it.
> In OpenVZ checkpointing we restart syscalls if process was caught in syscall
> during checkpointing.

Do you checkpoint uninterruptible syscalls as well? If only interruptible
syscalls are checkpointed, I'd say that either this syscall uses ERESTARTSYS or
ERESTART_RESTARTBLOCK, and then signal handling code already does the trick, or
this syscall does not restart itself when interrupted, and well, this is life,
userspace just sees -EINTR, which is allowed by the syscall spec.
Actually this is how we checkpoint/migrate tasks in interruptible syscalls in
Kerrighed and this works.
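
For reference, the mechanism I mean is the one in the arch signal-delivery
code; simplified (not an exact quote from arch/x86), it does:

	/* in do_signal(), when no user handler ends up being invoked */
	switch (regs->ax) {
	case -ERESTARTNOHAND:
	case -ERESTARTSYS:
	case -ERESTARTNOINTR:
		regs->ax = regs->orig_ax;	/* put the syscall number back */
		regs->ip -= 2;			/* re-execute int $0x80 / sysenter */
		break;
	}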

Louis

--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes



2008-10-22 10:07:33

by Greg Kurz

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 08/10] Introduce functions to restart a process

On Wed, 2008-10-22 at 11:25 +0200, Louis Rilling wrote:
> Do you checkpoint uninterruptible syscalls as well? If only interruptible
> syscalls are checkpointed, I'd say that either this syscall uses ERESTARTSYS or
> ERESTART_RESTARTBLOCK, and then signal handling code already does the trick, or
> this syscall does not restart itself when interrupted, and well, this is life,
> userspace just sees -EINTR, which is allowed by the syscall spec.
> Actually this is how we checkpoint/migrate tasks in interruptible syscalls in
> Kerrighed and this works.
>
> Louis
>

I don't know Kerrighed internals, but I understand you perform the checkpoint
with a signal handler. Right? This approach has a huge benefit: the
signal handling code does all the arch-dependent stuff to save registers
in user memory.

--
Gregory Kurz [email protected]
Software Engineer @ IBM/Meiosys http://www.ibm.com
Tel +33 (0)534 638 479 Fax +33 (0)561 400 420

"Anarchy is about taking complete responsibility for yourself."
Alan Moore.

2008-10-22 10:14:26

by Andrey Mirkin

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 08/10] Introduce functions to restart a process

On Wednesday 22 October 2008 13:25 Louis Rilling wrote:
> On Wed, Oct 22, 2008 at 12:49:54PM +0400, Andrey Mirkin wrote:
> > On Monday 20 October 2008 13:23 Cedric Le Goater wrote:
> > > Hello Andrey !
> > >
> > > > diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
> > > > index 109792b..a4848a3 100644
> > > > --- a/arch/x86/kernel/entry_32.S
> > > > +++ b/arch/x86/kernel/entry_32.S
> > > > @@ -225,6 +225,7 @@ ENTRY(ret_from_fork)
> > > > GET_THREAD_INFO(%ebp)
> > > > popl %eax
> > > > CFI_ADJUST_CFA_OFFSET -4
> > > > +ret_from_fork_tail:
> > > > pushl $0x0202 # Reset kernel eflags
> > > > CFI_ADJUST_CFA_OFFSET 4
> > > > popfl
> > > > @@ -233,6 +234,26 @@ ENTRY(ret_from_fork)
> > > > CFI_ENDPROC
> > > > END(ret_from_fork)
> > > >
> > > > +ENTRY(i386_ret_from_resume)
> > > > + CFI_STARTPROC
> > > > + pushl %eax
> > > > + CFI_ADJUST_CFA_OFFSET 4
> > > > + call schedule_tail
> > > > + GET_THREAD_INFO(%ebp)
> > > > + popl %eax
> > > > + CFI_ADJUST_CFA_OFFSET -4
> > > > + movl (%esp), %eax
> > > > + testl %eax, %eax
> > > > + jz 1f
> > > > + pushl %esp
> > > > + call *%eax
> > > > + addl $4, %esp
> > > > +1:
> > > > + addl $256, %esp
> > > > + jmp ret_from_fork_tail
> > > > + CFI_ENDPROC
> > > > +END(i386_ret_from_resume)
> > >
> > > Could you explain why you need to do this
> > >
> > > call *%eax
> > >
> > > is it related to the freezer code ?
> >
> > It is not related to the freezer code actually.
> > That is needed to restart syscalls. Right now I don't have a code in my
> > patchset which restarts a syscall, but later I plan to add it.
> > In OpenVZ checkpointing we restart syscalls if process was caught in
> > syscall during checkpointing.
>
> Do you checkpoint uninterruptible syscalls as well? If only interruptible
> syscalls are checkpointed, I'd say that either this syscall uses
> ERESTARTSYS or ERESTART_RESTARTBLOCK, and then signal handling code already
> does the trick, or this syscall does not restart itself when interrupted,
> and well, this is life, userspace just sees -EINTR, which is allowed by the
> syscall spec.
> Actually this is how we checkpoint/migrate tasks in interruptible syscalls
> in Kerrighed and this works.

We checkpoint only interruptible syscalls. Some syscalls do not restart
themselves; that is why, after restarting a process, we restart the syscall to
complete it.

Andrey

2008-10-22 10:44:58

by Louis Rilling

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 08/10] Introduce functions to restart a process

On Wed, Oct 22, 2008 at 12:06:19PM +0200, Greg Kurz wrote:
> On Wed, 2008-10-22 at 11:25 +0200, Louis Rilling wrote:
> > Do you checkpoint uninterruptible syscalls as well? If only interruptible
> > syscalls are checkpointed, I'd say that either this syscall uses ERESTARTSYS or
> > ERESTART_RESTARTBLOCK, and then signal handling code already does the trick, or
> > this syscall does not restart itself when interrupted, and well, this is life,
> > userspace just sees -EINTR, which is allowed by the syscall spec.
> > Actually this is how we checkpoint/migrate tasks in interruptible syscalls in
> > Kerrighed and this works.
> >
> > Louis
> >
>
> I don't know Kerrighed internals but I understand you perform checkpoint
> with a signal handler. Right ?

Right. This is a kernel-internal-only signal, so all signals remain available
for userspace.

> This approach has a huge benefit: the
> signal handling code do all the arch dependant stuff to save registers
> in user memory.

Hm, I'm not sure I understand what you mean here. We just rely on the arch code
that jumps to signal handling to correctly set up struct pt_regs, which is then
passed to the checkpoint code. So yes, userspace registers are mostly saved by
existing arch code. But on x86-64, for instance, segment registers still need to
be saved by the checkpoint code (a bit like copy_thread() does), and I don't
know of arch-independent functions doing this.
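
Concretely, for the task being checkpointed we end up doing something close to
what copy_thread() does on x86-64 (a sketch; "task" is just a name here):

	savesegment(gs, task->thread.gsindex);
	savesegment(fs, task->thread.fsindex);
	savesegment(es, task->thread.es);
	savesegment(ds, task->thread.ds);
	/* plus the FS/GS base MSRs, which struct pt_regs does not carry either */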

Louis

--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes



2008-10-22 10:46:46

by Louis Rilling

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 08/10] Introduce functions to restart a process

On Wed, Oct 22, 2008 at 02:12:12PM +0400, Andrey Mirkin wrote:
> On Wednesday 22 October 2008 13:25 Louis Rilling wrote:
> > On Wed, Oct 22, 2008 at 12:49:54PM +0400, Andrey Mirkin wrote:
> > > On Monday 20 October 2008 13:23 Cedric Le Goater wrote:
> > > > Hello Andrey !
> > > >
> > > > > diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
> > > > > index 109792b..a4848a3 100644
> > > > > --- a/arch/x86/kernel/entry_32.S
> > > > > +++ b/arch/x86/kernel/entry_32.S
> > > > > @@ -225,6 +225,7 @@ ENTRY(ret_from_fork)
> > > > > GET_THREAD_INFO(%ebp)
> > > > > popl %eax
> > > > > CFI_ADJUST_CFA_OFFSET -4
> > > > > +ret_from_fork_tail:
> > > > > pushl $0x0202 # Reset kernel eflags
> > > > > CFI_ADJUST_CFA_OFFSET 4
> > > > > popfl
> > > > > @@ -233,6 +234,26 @@ ENTRY(ret_from_fork)
> > > > > CFI_ENDPROC
> > > > > END(ret_from_fork)
> > > > >
> > > > > +ENTRY(i386_ret_from_resume)
> > > > > + CFI_STARTPROC
> > > > > + pushl %eax
> > > > > + CFI_ADJUST_CFA_OFFSET 4
> > > > > + call schedule_tail
> > > > > + GET_THREAD_INFO(%ebp)
> > > > > + popl %eax
> > > > > + CFI_ADJUST_CFA_OFFSET -4
> > > > > + movl (%esp), %eax
> > > > > + testl %eax, %eax
> > > > > + jz 1f
> > > > > + pushl %esp
> > > > > + call *%eax
> > > > > + addl $4, %esp
> > > > > +1:
> > > > > + addl $256, %esp
> > > > > + jmp ret_from_fork_tail
> > > > > + CFI_ENDPROC
> > > > > +END(i386_ret_from_resume)
> > > >
> > > > Could you explain why you need to do this
> > > >
> > > > call *%eax
> > > >
> > > > is it related to the freezer code ?
> > >
> > > It is not related to the freezer code actually.
> > > That is needed to restart syscalls. Right now I don't have a code in my
> > > patchset which restarts a syscall, but later I plan to add it.
> > > In OpenVZ checkpointing we restart syscalls if process was caught in
> > > syscall during checkpointing.
> >
> > Do you checkpoint uninterruptible syscalls as well? If only interruptible
> > syscalls are checkpointed, I'd say that either this syscall uses
> > ERESTARTSYS or ERESTART_RESTARTBLOCK, and then signal handling code already
> > does the trick, or this syscall does not restart itself when interrupted,
> > and well, this is life, userspace just sees -EINTR, which is allowed by the
> > syscall spec.
> > Actually this is how we checkpoint/migrate tasks in interruptible syscalls
> > in Kerrighed and this works.
>
> We checkpoint only interruptible syscalls. Some syscalls do not restart
> themself, that is why after restarting a process we restart syscall to
> complete it.

I guess you do that to avoid breaking applications that are badly written and do
not handle -EINTR correctly with interruptible syscalls. Right?
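
That is, applications that lack the usual retry idiom around interruptible
calls, roughly:

	ssize_t n;

	do {
		n = read(fd, buf, count);
	} while (n < 0 && errno == EINTR);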

Louis

--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes



2008-10-22 12:44:29

by Greg Kurz

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 08/10] Introduce functions to restart a process

On Wed, 2008-10-22 at 12:44 +0200, Louis Rilling wrote:
> On Wed, Oct 22, 2008 at 12:06:19PM +0200, Greg Kurz wrote:
> > On Wed, 2008-10-22 at 11:25 +0200, Louis Rilling wrote:
> > > Do you checkpoint uninterruptible syscalls as well? If only interruptible
> > > syscalls are checkpointed, I'd say that either this syscall uses ERESTARTSYS or
> > > ERESTART_RESTARTBLOCK, and then signal handling code already does the trick, or
> > > this syscall does not restart itself when interrupted, and well, this is life,
> > > userspace just sees -EINTR, which is allowed by the syscall spec.
> > > Actually this is how we checkpoint/migrate tasks in interruptible syscalls in
> > > Kerrighed and this works.
> > >
> > > Louis
> > >
> >
> > I don't know Kerrighed internals but I understand you perform checkpoint
> > with a signal handler. Right ?
>
> Right. This is an kernel-internal-only signal, so all signals remain available
> for userspace.
>
> > This approach has a huge benefit: the
> > signal handling code do all the arch dependant stuff to save registers
> > in user memory.
>
> Hm, I'm not sure to understand what you mean here. We just rely on arch code
> that jumps to signal handling to correctly setup struct pt_regs, which is then
> passed to the checkpoint code. So yes, userspace registers are mostly saved by
> existing arch code. But in x86-64 for instance, segment registers still need to
> be saved by the checkpoint code (a bit like copy_thread() does), and I don't
> know arch-independent functions doing this.
>

You're right, some segment registers need to be saved on x86 also... I
should have written 'most of' in my previous mail.

--
Gregory Kurz [email protected]
Software Engineer @ IBM/Meiosys http://www.ibm.com
Tel +33 (0)534 638 479 Fax +33 (0)561 400 420

"Anarchy is about taking complete responsibility for yourself."
Alan Moore.

2008-10-22 12:48:26

by Cédric Le Goater

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 08/10] Introduce functions to restart a process

>>> +ENTRY(i386_ret_from_resume)
>>> + CFI_STARTPROC
>>> + pushl %eax
>>> + CFI_ADJUST_CFA_OFFSET 4
>>> + call schedule_tail
>>> + GET_THREAD_INFO(%ebp)
>>> + popl %eax
>>> + CFI_ADJUST_CFA_OFFSET -4
>>> + movl (%esp), %eax
>>> + testl %eax, %eax
>>> + jz 1f
>>> + pushl %esp
>>> + call *%eax
>>> + addl $4, %esp
>>> +1:
>>> + addl $256, %esp
>>> + jmp ret_from_fork_tail
>>> + CFI_ENDPROC
>>> +END(i386_ret_from_resume)
>> Could you explain why you need to do this
>>
>> call *%eax
>>
>> is it related to the freezer code ?
>
> It is not related to the freezer code actually.
> That is needed to restart syscalls. Right now I don't have a code in my
> patchset which restarts a syscall, but later I plan to add it.
> In OpenVZ checkpointing we restart syscalls if process was caught in syscall
> during checkpointing.

OK, I get it now. Why 256 bytes of extra stack? I'm sure it's not random.

Thanks,

C.

2008-10-22 15:27:07

by Oren Laadan

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 08/10] Introduce functions to restart a process



Andrey Mirkin wrote:
> On Wednesday 22 October 2008 13:25 Louis Rilling wrote:
>> On Wed, Oct 22, 2008 at 12:49:54PM +0400, Andrey Mirkin wrote:
>>> On Monday 20 October 2008 13:23 Cedric Le Goater wrote:
>>>> Hello Andrey !
>>>>
>>>>> diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
>>>>> index 109792b..a4848a3 100644
>>>>> --- a/arch/x86/kernel/entry_32.S
>>>>> +++ b/arch/x86/kernel/entry_32.S
>>>>> @@ -225,6 +225,7 @@ ENTRY(ret_from_fork)
>>>>> GET_THREAD_INFO(%ebp)
>>>>> popl %eax
>>>>> CFI_ADJUST_CFA_OFFSET -4
>>>>> +ret_from_fork_tail:
>>>>> pushl $0x0202 # Reset kernel eflags
>>>>> CFI_ADJUST_CFA_OFFSET 4
>>>>> popfl
>>>>> @@ -233,6 +234,26 @@ ENTRY(ret_from_fork)
>>>>> CFI_ENDPROC
>>>>> END(ret_from_fork)
>>>>>
>>>>> +ENTRY(i386_ret_from_resume)
>>>>> + CFI_STARTPROC
>>>>> + pushl %eax
>>>>> + CFI_ADJUST_CFA_OFFSET 4
>>>>> + call schedule_tail
>>>>> + GET_THREAD_INFO(%ebp)
>>>>> + popl %eax
>>>>> + CFI_ADJUST_CFA_OFFSET -4
>>>>> + movl (%esp), %eax
>>>>> + testl %eax, %eax
>>>>> + jz 1f
>>>>> + pushl %esp
>>>>> + call *%eax
>>>>> + addl $4, %esp
>>>>> +1:
>>>>> + addl $256, %esp
>>>>> + jmp ret_from_fork_tail
>>>>> + CFI_ENDPROC
>>>>> +END(i386_ret_from_resume)
>>>> Could you explain why you need to do this
>>>>
>>>> call *%eax
>>>>
>>>> is it related to the freezer code ?
>>> It is not related to the freezer code actually.
>>> That is needed to restart syscalls. Right now I don't have a code in my
>>> patchset which restarts a syscall, but later I plan to add it.
>>> In OpenVZ checkpointing we restart syscalls if process was caught in
>>> syscall during checkpointing.
>> Do you checkpoint uninterruptible syscalls as well? If only interruptible
>> syscalls are checkpointed, I'd say that either this syscall uses
>> ERESTARTSYS or ERESTART_RESTARTBLOCK, and then signal handling code already
>> does the trick, or this syscall does not restart itself when interrupted,
>> and well, this is life, userspace just sees -EINTR, which is allowed by the
>> syscall spec.
>> Actually this is how we checkpoint/migrate tasks in interruptible syscalls
>> in Kerrighed and this works.
>
> We checkpoint only interruptible syscalls. Some syscalls do not restart
> themself, that is why after restarting a process we restart syscall to
> complete it.

Can you please elaborate on this ? I don't recall having had issues
with that.

Thanks,

Oren.

>
> Andrey
> _______________________________________________
> Containers mailing list
> [email protected]
> https://lists.linux-foundation.org/mailman/listinfo/containers

2008-10-23 08:45:51

by Andrey Mirkin

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 06/10] Introduce functions to dump mm

On Monday 20 October 2008 21:21 Dave Hansen wrote:
> On Sat, 2008-10-18 at 03:11 +0400, Andrey Mirkin wrote:
> > +static void page_get_desc(struct vm_area_struct *vma, unsigned long addr,
> > +		struct page_desc *pdesc, cpt_context_t * ctx)
> > +{
> > + struct mm_struct *mm = vma->vm_mm;
> > + pgd_t *pgd;
> > + pud_t *pud;
> > + pmd_t *pmd;
> > + pte_t *ptep, pte;
> > + spinlock_t *ptl;
> > + struct page *pg = NULL;
> > + pgoff_t linear_index = (addr - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff;
> > +
> > + pdesc->index = linear_index;
> > + pdesc->shared = 0;
> > + pdesc->mm = CPT_NULL;
> > +
> > + if (vma->vm_flags & VM_IO) {
> > + pdesc->type = PD_ABSENT;
> > + return;
> > + }
> > +
> > + pgd = pgd_offset(mm, addr);
> > + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
> > + goto out_absent;
> > + pud = pud_offset(pgd, addr);
> > + if (pud_none(*pud) || unlikely(pud_bad(*pud)))
> > + goto out_absent;
> > + pmd = pmd_offset(pud, addr);
> > + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
> > + goto out_absent;
> > +#ifdef CONFIG_X86
> > + if (pmd_huge(*pmd)) {
> > + eprintk("page_huge\n");
> > + goto out_unsupported;
> > + }
> > +#endif
>
> I take it you know that this breaks with the 1GB (x86_64) and 16GB (ppc)
> large pages.
>
> Since you have the VMA, why not use is_vm_hugetlb_page()?
Right now I'm checking the VM_HUGETLB flag on VMAs in dump_one_vma().
This check was added for sanity purposes, just to throw out all the cases that
are not supported right now.

Andrey

2008-10-23 08:54:49

by Andrey Mirkin

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 08/10] Introduce functions to restart a process

On Wednesday 22 October 2008 14:46 Louis Rilling wrote:
> On Wed, Oct 22, 2008 at 02:12:12PM +0400, Andrey Mirkin wrote:
> > On Wednesday 22 October 2008 13:25 Louis Rilling wrote:
> > > On Wed, Oct 22, 2008 at 12:49:54PM +0400, Andrey Mirkin wrote:
> > > > On Monday 20 October 2008 13:23 Cedric Le Goater wrote:
> > > > > Hello Andrey !
> > > > >
> > > > > > diff --git a/arch/x86/kernel/entry_32.S
> > > > > > b/arch/x86/kernel/entry_32.S index 109792b..a4848a3 100644
> > > > > > --- a/arch/x86/kernel/entry_32.S
> > > > > > +++ b/arch/x86/kernel/entry_32.S
> > > > > > @@ -225,6 +225,7 @@ ENTRY(ret_from_fork)
> > > > > > GET_THREAD_INFO(%ebp)
> > > > > > popl %eax
> > > > > > CFI_ADJUST_CFA_OFFSET -4
> > > > > > +ret_from_fork_tail:
> > > > > > pushl $0x0202 # Reset kernel eflags
> > > > > > CFI_ADJUST_CFA_OFFSET 4
> > > > > > popfl
> > > > > > @@ -233,6 +234,26 @@ ENTRY(ret_from_fork)
> > > > > > CFI_ENDPROC
> > > > > > END(ret_from_fork)
> > > > > >
> > > > > > +ENTRY(i386_ret_from_resume)
> > > > > > + CFI_STARTPROC
> > > > > > + pushl %eax
> > > > > > + CFI_ADJUST_CFA_OFFSET 4
> > > > > > + call schedule_tail
> > > > > > + GET_THREAD_INFO(%ebp)
> > > > > > + popl %eax
> > > > > > + CFI_ADJUST_CFA_OFFSET -4
> > > > > > + movl (%esp), %eax
> > > > > > + testl %eax, %eax
> > > > > > + jz 1f
> > > > > > + pushl %esp
> > > > > > + call *%eax
> > > > > > + addl $4, %esp
> > > > > > +1:
> > > > > > + addl $256, %esp
> > > > > > + jmp ret_from_fork_tail
> > > > > > + CFI_ENDPROC
> > > > > > +END(i386_ret_from_resume)
> > > > >
> > > > > Could you explain why you need to do this
> > > > >
> > > > > call *%eax
> > > > >
> > > > > is it related to the freezer code ?
> > > >
> > > > It is not related to the freezer code actually.
> > > > That is needed to restart syscalls. Right now I don't have a code in
> > > > my patchset which restarts a syscall, but later I plan to add it. In
> > > > OpenVZ checkpointing we restart syscalls if process was caught in
> > > > syscall during checkpointing.
> > >
> > > Do you checkpoint uninterruptible syscalls as well? If only
> > > interruptible syscalls are checkpointed, I'd say that either this
> > > syscall uses ERESTARTSYS or ERESTART_RESTARTBLOCK, and then signal
> > > handling code already does the trick, or this syscall does not restart
> > > itself when interrupted, and well, this is life, userspace just sees
> > > -EINTR, which is allowed by the syscall spec.
> > > Actually this is how we checkpoint/migrate tasks in interruptible
> > > syscalls in Kerrighed and this works.
> >
> > We checkpoint only interruptible syscalls. Some syscalls do not restart
> > themself, that is why after restarting a process we restart syscall to
> > complete it.
>
> I guess you do that to avoid breaking application that are badly written
> and do not handle -EINTR correctly with interruptible syscalls. Right?

Right. It is also needed to restart some syscalls (like pause) from the kernel
without returning to user space. Let me explain in more detail. If we went back
through user space, there would be a gap while the process is in user space,
just before it enters the syscall again. During that gap a signal can be
delivered to the process and even handled, so we would miss a signal which was
supposed to interrupt the pause syscall.
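
The plan is to park a small hook in the reserved stack area, which
i386_ret_from_resume then calls before the task can reach user space. A rough
sketch with hypothetical names (this is not code from the current patchset):

/* called via "call *%eax"; the argument is the stack pointer, i.e. the
 * reserved area just below the saved pt_regs */
static void restart_pause_hook(void *frame)
{
	struct pt_regs *regs = task_pt_regs(current);

	/* re-enter the syscall directly from the kernel, so there is no
	 * window in user space where a signal could be handled and the
	 * restarted pause() would then miss it */
	regs->ax = sys_pause();
}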

Andrey

2008-10-23 09:02:35

by Andrey Mirkin

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 08/10] Introduce functions to restart a process

On Wednesday 22 October 2008 19:25 Oren Laadan wrote:
> Andrey Mirkin wrote:
> > On Wednesday 22 October 2008 13:25 Louis Rilling wrote:
> >> On Wed, Oct 22, 2008 at 12:49:54PM +0400, Andrey Mirkin wrote:
> >>> On Monday 20 October 2008 13:23 Cedric Le Goater wrote:
> >>>> Hello Andrey !
> >>>>
> >>>>> diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
> >>>>> index 109792b..a4848a3 100644
> >>>>> --- a/arch/x86/kernel/entry_32.S
> >>>>> +++ b/arch/x86/kernel/entry_32.S
> >>>>> @@ -225,6 +225,7 @@ ENTRY(ret_from_fork)
> >>>>> GET_THREAD_INFO(%ebp)
> >>>>> popl %eax
> >>>>> CFI_ADJUST_CFA_OFFSET -4
> >>>>> +ret_from_fork_tail:
> >>>>> pushl $0x0202 # Reset kernel eflags
> >>>>> CFI_ADJUST_CFA_OFFSET 4
> >>>>> popfl
> >>>>> @@ -233,6 +234,26 @@ ENTRY(ret_from_fork)
> >>>>> CFI_ENDPROC
> >>>>> END(ret_from_fork)
> >>>>>
> >>>>> +ENTRY(i386_ret_from_resume)
> >>>>> + CFI_STARTPROC
> >>>>> + pushl %eax
> >>>>> + CFI_ADJUST_CFA_OFFSET 4
> >>>>> + call schedule_tail
> >>>>> + GET_THREAD_INFO(%ebp)
> >>>>> + popl %eax
> >>>>> + CFI_ADJUST_CFA_OFFSET -4
> >>>>> + movl (%esp), %eax
> >>>>> + testl %eax, %eax
> >>>>> + jz 1f
> >>>>> + pushl %esp
> >>>>> + call *%eax
> >>>>> + addl $4, %esp
> >>>>> +1:
> >>>>> + addl $256, %esp
> >>>>> + jmp ret_from_fork_tail
> >>>>> + CFI_ENDPROC
> >>>>> +END(i386_ret_from_resume)
> >>>>
> >>>> Could you explain why you need to do this
> >>>>
> >>>> call *%eax
> >>>>
> >>>> is it related to the freezer code ?
> >>>
> >>> It is not related to the freezer code actually.
> >>> That is needed to restart syscalls. Right now I don't have a code in my
> >>> patchset which restarts a syscall, but later I plan to add it.
> >>> In OpenVZ checkpointing we restart syscalls if process was caught in
> >>> syscall during checkpointing.
> >>
> >> Do you checkpoint uninterruptible syscalls as well? If only
> >> interruptible syscalls are checkpointed, I'd say that either this
> >> syscall uses ERESTARTSYS or ERESTART_RESTARTBLOCK, and then signal
> >> handling code already does the trick, or this syscall does not restart
> >> itself when interrupted, and well, this is life, userspace just sees
> >> -EINTR, which is allowed by the syscall spec.
> >> Actually this is how we checkpoint/migrate tasks in interruptible
> >> syscalls in Kerrighed and this works.
> >
> > We checkpoint only interruptible syscalls. Some syscalls do not restart
> > themself, that is why after restarting a process we restart syscall to
> > complete it.
>
> Can you please elaborate on this ? I don't recall having had issues
> with that.

Right now, in the 2.6.18 kernel, we restart the pause, rt_sigtimedwait
and futex syscalls in this way. Recently the futex syscall was reworked, so we
will no longer need such a hook for it.

Andrey

2008-10-23 09:55:38

by Andrey Mirkin

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 08/10] Introduce functions to restart a process

On Wednesday 22 October 2008 16:47 Cedric Le Goater wrote:
> >>> +ENTRY(i386_ret_from_resume)
> >>> + CFI_STARTPROC
> >>> + pushl %eax
> >>> + CFI_ADJUST_CFA_OFFSET 4
> >>> + call schedule_tail
> >>> + GET_THREAD_INFO(%ebp)
> >>> + popl %eax
> >>> + CFI_ADJUST_CFA_OFFSET -4
> >>> + movl (%esp), %eax
> >>> + testl %eax, %eax
> >>> + jz 1f
> >>> + pushl %esp
> >>> + call *%eax
> >>> + addl $4, %esp
> >>> +1:
> >>> + addl $256, %esp
> >>> + jmp ret_from_fork_tail
> >>> + CFI_ENDPROC
> >>> +END(i386_ret_from_resume)
> >>
> >> Could you explain why you need to do this
> >>
> >> call *%eax
> >>
> >> is it related to the freezer code ?
> >
> > It is not related to the freezer code actually.
> > That is needed to restart syscalls. Right now I don't have a code in my
> > patchset which restarts a syscall, but later I plan to add it.
> > In OpenVZ checkpointing we restart syscalls if process was caught in
> > syscall during checkpointing.
>
> ok. I get it now. why 256 bytes of extra stack ? I'm sure it's not random.
We put a special structure on the stack, which is used at the very end of
the whole restart procedure to restore complex state (ptrace is one such
case). Right now I don't need to use this structure, as we only deal with
simple cases, but the reservation of 256 bytes on the stack is needed for the
future.
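
In terms of rst_restore_registers(), which already does
"tsk->thread.sp -= HOOK_RESERVE", the intended layout is roughly the following
(a hypothetical structure, not part of this patchset yet):

/* lives in the HOOK_RESERVE (256) bytes just below the saved pt_regs;
 * the first word is what i386_ret_from_resume loads into %eax and calls
 * if it is non-NULL */
struct resume_hook_frame {
	void (*hook)(void *frame);			/* NULL: nothing to do */
	char data[HOOK_RESERVE - sizeof(void *)];	/* hook-private state */
};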

Andrey

2008-10-23 10:56:21

by Andrey Mirkin

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 08/10] Introduce functions to restart a process

On Monday 20 October 2008 17:25 Louis Rilling wrote:
> On Sat, Oct 18, 2008 at 03:11:36AM +0400, Andrey Mirkin wrote:
> > Functions to restart process, restore its state, fpu and registers are
> > added.
>
> [...]
>
> > diff --git a/checkpoint/rst_process.c b/checkpoint/rst_process.c
> > new file mode 100644
> > index 0000000..b9f745e
> > --- /dev/null
> > +++ b/checkpoint/rst_process.c
> > @@ -0,0 +1,277 @@
> > +/*
> > + * Copyright (C) 2008 Parallels, Inc.
> > + *
> > + * Author: Andrey Mirkin <[email protected]>
> > + *
> > + * This program is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU General Public License as
> > + * published by the Free Software Foundation, version 2 of the
> > + * License.
> > + *
> > + */
> > +
> > +#include <linux/sched.h>
> > +#include <linux/fs.h>
> > +#include <linux/file.h>
> > +#include <linux/version.h>
> > +#include <linux/module.h>
> > +
> > +#include "checkpoint.h"
> > +#include "cpt_image.h"
> > +
> > +#define HOOK_RESERVE 256
> > +
> > +struct thr_context {
> > + struct completion complete;
> > + int error;
> > + struct cpt_context *ctx;
> > + struct task_struct *tsk;
> > +};
> > +
> > +int local_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags,
> > +		pid_t pid)
> > +{
> > + pid_t ret;
> > +
> > + if (current->fs == NULL) {
> > + /* do_fork_pid() hates processes without fs, oopses. */
> > + eprintk("local_kernel_thread: current->fs==NULL\n");
> > + return -EINVAL;
> > + }
> > + if (!try_module_get(THIS_MODULE))
> > + return -EBUSY;
> > + ret = kernel_thread(fn, arg, flags);
> > + if (ret < 0)
> > + module_put(THIS_MODULE);
> > + return ret;
> > +}
> > +
> > +static unsigned int decode_task_flags(unsigned int task_flags)
> > +{
> > + unsigned int flags = 0;
> > +
> > + if (task_flags & (1 << CPT_PF_EXITING))
> > + flags |= PF_EXITING;
> > + if (task_flags & (1 << CPT_PF_FORKNOEXEC))
> > + flags |= PF_FORKNOEXEC;
> > + if (task_flags & (1 << CPT_PF_SUPERPRIV))
> > + flags |= PF_SUPERPRIV;
> > + if (task_flags & (1 << CPT_PF_DUMPCORE))
> > + flags |= PF_DUMPCORE;
> > + if (task_flags & (1 << CPT_PF_SIGNALED))
> > + flags |= PF_SIGNALED;
> > +
> > + return flags;
> > +
> > +}
> > +
> > +int rst_restore_task_struct(struct task_struct *tsk, struct cpt_task_image *ti,
> > +		struct cpt_context *ctx)
> > +{
> > + int i;
> > +
> > + /* Restore only saved flags, comm and tls for now */
> > + tsk->flags = decode_task_flags(ti->cpt_flags);
> > + clear_tsk_thread_flag(tsk, TIF_FREEZE);
> > + memcpy(tsk->comm, ti->cpt_comm, TASK_COMM_LEN);
> > + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) {
> > + tsk->thread.tls_array[i].a = ti->cpt_tls[i] & 0xFFFFFFFF;
> > + tsk->thread.tls_array[i].b = ti->cpt_tls[i] >> 32;
> > + }
> > +
> > + return 0;
> > +}
> > +
> > +static int rst_restore_fpustate(struct task_struct *tsk, struct cpt_task_image *ti,
> > +		struct cpt_context *ctx)
> > +{
> > + struct cpt_obj_bits hdr;
> > + int err;
> > + char *buf;
> > +
> > + clear_stopped_child_used_math(tsk);
> > +
> > + err = rst_get_object(CPT_OBJ_BITS, &hdr, sizeof(hdr), ctx);
> > + if (err < 0)
> > + return err;
> > +
> > + buf = kmalloc(hdr.cpt_size, GFP_KERNEL);
> > + if (!buf)
> > + return -ENOMEM;
> > +
> > + err = ctx->read(buf, hdr.cpt_size, ctx);
> > + if (err)
> > + goto out;
> > +
> > + if (hdr.cpt_content == CPT_CONTENT_X86_FPUSTATE && cpu_has_fxsr) {
> > + memcpy(&tsk->thread.xstate, buf,
> > + sizeof(struct i387_fxsave_struct));
> > + if (ti->cpt_flags & CPT_PF_USED_MATH)
> > + set_stopped_child_used_math(tsk);
> > + }
> > +#ifndef CONFIG_X86_64
> > + else if (hdr.cpt_content == CPT_CONTENT_X86_FPUSTATE_OLD &&
> > + !cpu_has_fxsr) {
> > + memcpy(&tsk->thread.xstate, buf,
> > + sizeof(struct i387_fsave_struct));
> > + if (ti->cpt_flags & CPT_PF_USED_MATH)
> > + set_stopped_child_used_math(tsk);
> > + }
> > +#endif
> > +
> > +out:
> > + kfree(buf);
> > + return err;
> > +}
> > +
> > +static u32 decode_segment(u32 segid)
> > +{
> > + if (segid == CPT_SEG_ZERO)
> > + return 0;
> > +
> > + /* TLS descriptors */
> > + if (segid <= CPT_SEG_TLS3)
> > + return ((GDT_ENTRY_TLS_MIN + segid - CPT_SEG_TLS1) << 3) + 3;
> > +
> > + /* LDT descriptor, it is just an index to LDT array */
> > + if (segid >= CPT_SEG_LDT)
> > + return ((segid - CPT_SEG_LDT) << 3) | 7;
> > +
> > + /* Check for one of standard descriptors */
> > + if (segid == CPT_SEG_USER32_DS)
> > + return __USER_DS;
> > + if (segid == CPT_SEG_USER32_CS)
> > + return __USER_CS;
> > +
> > + eprintk("Invalid segment reg %d\n", segid);
> > + return 0;
> > +}
> > +
> > +static int rst_restore_registers(struct task_struct *tsk,
> > +		struct cpt_context *ctx)
> > +{
> > + struct cpt_x86_regs ri;
> > + struct pt_regs *regs = task_pt_regs(tsk);
> > + extern char i386_ret_from_resume;
> > + int err;
> > +
> > + err = rst_get_object(CPT_OBJ_X86_REGS, &ri, sizeof(ri), ctx);
> > + if (err < 0)
> > + return err;
> > +
> > + tsk->thread.sp = (unsigned long) regs;
> > + tsk->thread.sp0 = (unsigned long) (regs+1);
> > + tsk->thread.ip = (unsigned long) &i386_ret_from_resume;
> > +
> > + tsk->thread.gs = decode_segment(ri.cpt_gs);
> > + tsk->thread.debugreg0 = ri.cpt_debugreg[0];
> > + tsk->thread.debugreg1 = ri.cpt_debugreg[1];
> > + tsk->thread.debugreg2 = ri.cpt_debugreg[2];
> > + tsk->thread.debugreg3 = ri.cpt_debugreg[3];
> > + tsk->thread.debugreg6 = ri.cpt_debugreg[6];
> > + tsk->thread.debugreg7 = ri.cpt_debugreg[7];
> > +
> > + regs->bx = ri.cpt_bx;
> > + regs->cx = ri.cpt_cx;
> > + regs->dx = ri.cpt_dx;
> > + regs->si = ri.cpt_si;
> > + regs->di = ri.cpt_di;
> > + regs->bp = ri.cpt_bp;
> > + regs->ax = ri.cpt_ax;
> > + regs->orig_ax = ri.cpt_orig_ax;
> > + regs->ip = ri.cpt_ip;
> > + regs->flags = ri.cpt_flags;
> > + regs->sp = ri.cpt_sp;
> > +
> > + regs->cs = decode_segment(ri.cpt_cs);
> > + regs->ss = decode_segment(ri.cpt_ss);
> > + regs->ds = decode_segment(ri.cpt_ds);
> > + regs->es = decode_segment(ri.cpt_es);
> > + regs->fs = decode_segment(ri.cpt_fs);
> > +
> > + tsk->thread.sp -= HOOK_RESERVE;
> > + memset((void*)tsk->thread.sp, 0, HOOK_RESERVE);
> > +
> > + return 0;
> > +}
> > +
> > +static int restart_thread(void *arg)
> > +{
> > + struct thr_context *thr_ctx = arg;
> > + struct cpt_context *ctx;
> > + struct cpt_task_image *ti;
> > + int err;
> > +
> > + current->state = TASK_UNINTERRUPTIBLE;
> > +
> > + ctx = thr_ctx->ctx;
> > + ti = kmalloc(sizeof(*ti), GFP_KERNEL);
> > + if (!ti)
> > + return -ENOMEM;
> > +
> > + err = rst_get_object(CPT_OBJ_TASK, ti, sizeof(*ti), ctx);
> > + if (!err)
> > + err = rst_restore_task_struct(current, ti, ctx);
> > + /* Restore mm here */
> > + if (!err)
> > + err = rst_restore_fpustate(current, ti, ctx);
> > + if (!err)
> > + err = rst_restore_registers(current, ctx);
> > +
> > + thr_ctx->error = err;
> > + complete(&thr_ctx->complete);
> > +
> > + if (!err && (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) {
> > + do_exit(ti->cpt_exit_code);
> > + } else {
> > + __set_current_state(TASK_UNINTERRUPTIBLE);
> > + }
> > +
> > + kfree(ti);
> > + schedule();
> > +
> > + eprintk("leaked %d/%d %p\n", task_pid_nr(current),
> > task_pid_vnr(current), current->mm); +
> > + module_put(THIS_MODULE);
>
> I'm sorry, I still do not understand what you are doing with this
> self-module pinning stuff. AFAICS, we should not get here unless there is a
> bug. So the checkpoint module ref count is never decreased, right?
>
> Could you detail what is this self-module pinning for? As I already told
> you, this looks like a bogus solution to avoid unloading the checkpoint
> module during restart.

Actually, right now the module refcount increase/decrease is not needed.
But in some cases the restore work has to be done only after unfreezing the
process. In that case we should grab the refcount during process creation
and put it after this special work is done.
I will rework this place and send it in the next version to make it clearer how
it will be used in the future.
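
The intended pattern is roughly (a sketch, not in this patchset yet):

	if (!try_module_get(THIS_MODULE))	/* taken when the task is created */
		return -EBUSY;
	/*
	 * ... the restarted task runs, gets unfrozen and finishes the
	 * deferred restore work from its hook ...
	 */
	module_put(THIS_MODULE);		/* dropped only after that work */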

Andrey

2008-10-23 13:49:29

by Dave Hansen

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 08/10] Introduce functions to restart a process

On Thu, 2008-10-23 at 13:54 +0400, Andrey Mirkin wrote:
> We are putting special structure on stack, which is used at the very end of
> the whole restart procedure to restore complex states (ptrace is one of such
> cases). Right now I don't need to use this structure as we have a deal with
> simple cases, but reservation of 256 bytes on stack is needed for future.

Wow. So you're saying that, if this patch is accepted, we simply need
to accept that anything being checkpointed will use an extra 256 bytes
of stack? Seems like something to perhaps put in the changelog rather
than some completely undocumented assembly nugget.

-- Dave

2008-10-23 13:51:45

by Dave Hansen

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 06/10] Introduce functions to dump mm

On Thu, 2008-10-23 at 12:43 +0400, Andrey Mirkin wrote:
> > > +#ifdef CONFIG_X86
> > > + if (pmd_huge(*pmd)) {
> > > + eprintk("page_huge\n");
> > > + goto out_unsupported;
> > > + }
> > > +#endif
> >
> > I take it you know that this breaks with the 1GB (x86_64) and 16GB (ppc)
> > large pages.
> >
> > Since you have the VMA, why not use is_vm_hugetlb_page()?
> Right now I'm checking VM_HUGETLB flag on VMAs in dump_one_vma().
> This checks were added for sanity purpose just to throw out all unsupported
> right now cases.

I'm telling you that it's no good. Not only should this path never be
hit, but, if it is, you'll crash anyway in some cases.

It's a bad check. At best it misleads the reader into thinking that you've
covered your bases.

-- Dave

2008-10-23 13:58:19

by Dave Hansen

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 08/10] Introduce functions to restart a process

On Thu, 2008-10-23 at 13:00 +0400, Andrey Mirkin wrote:
>
> > >>> It is not related to the freezer code actually.
> > >>> That is needed to restart syscalls. Right now I don't have a code in my
> > >>> patchset which restarts a syscall, but later I plan to add it.
> > >>> In OpenVZ checkpointing we restart syscalls if process was caught in
> > >>> syscall during checkpointing.
> > >>
> > >> Do you checkpoint uninterruptible syscalls as well? If only
> > >> interruptible syscalls are checkpointed, I'd say that either this
> > >> syscall uses ERESTARTSYS or ERESTART_RESTARTBLOCK, and then signal
> > >> handling code already does the trick, or this syscall does not restart
> > >> itself when interrupted, and well, this is life, userspace just sees
> > >> -EINTR, which is allowed by the syscall spec.
> > >> Actually this is how we checkpoint/migrate tasks in interruptible
> > >> syscalls in Kerrighed and this works.
> > >
> > > We checkpoint only interruptible syscalls. Some syscalls do not restart
> > > themself, that is why after restarting a process we restart syscall to
> > > complete it.
> >
> > Can you please elaborate on this ? I don't recall having had issues
> > with that.
>
> Right now in 2.6.18 kernel we restarts in such a way pause, rt_sigtimedwait
> and futex syscalls. Recently futex syscall was reworked and we will not need
> such hooks for it.

Could you elaborate on this a bit?

If the futex syscall was reworked, perhaps we can do the same for
rt_sigtimedwait() and get rid of this code completely.

-- Dave

2008-10-24 03:57:51

by Andrey Mirkin

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 08/10] Introduce functions to restart a process

On Thursday 23 October 2008 17:57 Dave Hansen wrote:
> On Thu, 2008-10-23 at 13:00 +0400, Andrey Mirkin wrote:
> > > >>> It is not related to the freezer code actually.
> > > >>> That is needed to restart syscalls. Right now I don't have a code
> > > >>> in my patchset which restarts a syscall, but later I plan to add
> > > >>> it. In OpenVZ checkpointing we restart syscalls if process was
> > > >>> caught in syscall during checkpointing.
> > > >>
> > > >> Do you checkpoint uninterruptible syscalls as well? If only
> > > >> interruptible syscalls are checkpointed, I'd say that either this
> > > >> syscall uses ERESTARTSYS or ERESTART_RESTARTBLOCK, and then signal
> > > >> handling code already does the trick, or this syscall does not
> > > >> restart itself when interrupted, and well, this is life, userspace
> > > >> just sees -EINTR, which is allowed by the syscall spec.
> > > >> Actually this is how we checkpoint/migrate tasks in interruptible
> > > >> syscalls in Kerrighed and this works.
> > > >
> > > > We checkpoint only interruptible syscalls. Some syscalls do not
> > > > restart themself, that is why after restarting a process we restart
> > > > syscall to complete it.
> > >
> > > Can you please elaborate on this ? I don't recall having had issues
> > > with that.
> >
> > Right now in 2.6.18 kernel we restarts in such a way pause,
> > rt_sigtimedwait and futex syscalls. Recently futex syscall was reworked
> > and we will not need such hooks for it.
>
> Could you elaborate on this a bit?
>
> If the futex syscall was reworked, perhaps we can do the same for
> rt_sigtimedwait() and get rid of this code completely.

Well, we can try to rework rt_sigtimedwait(), but we will still need this code
in the future to restart the pause syscall from the kernel without returning to
user space. This code will also be needed to restore some complex states.
As for the pause syscall, I have already written to Louis about the problem we
are trying to solve with this code: if we returned to user space, there would
be a gap while the process is in user space just before it enters the syscall
again. During that gap a signal can be delivered to the process and even
handled, so we would miss a signal which was supposed to interrupt the pause
syscall.

Andrey

2008-10-24 04:04:48

by Andrey Mirkin

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 08/10] Introduce functions to restart a process

On Thursday 23 October 2008 17:49 Dave Hansen wrote:
> On Thu, 2008-10-23 at 13:54 +0400, Andrey Mirkin wrote:
> > We are putting special structure on stack, which is used at the very end
> > of the whole restart procedure to restore complex states (ptrace is one
> > of such cases). Right now I don't need to use this structure as we have a
> > deal with simple cases, but reservation of 256 bytes on stack is needed
> > for future.
>
> Wow. So you're saying that, if this patch is accepted, we simply need
> to accept that anything being checkpointed will use an extra 256 bytes
> of stack? Seems like something to perhaps put in the changelog rather
> than some completely undocumented assembly nugget.

These 256 bytes are used only during the restart procedure and only by our
module. As you can see in i386_ret_from_resume, we give them back, so by the
time the process returns to user space it no longer has the extra 256 bytes
reserved on its stack. I will add information about this to the documentation
and the changelog.

Andrey

2008-10-24 04:07:46

by Andrey Mirkin

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 06/10] Introduce functions to dump mm

On Thursday 23 October 2008 17:51 Dave Hansen wrote:
> On Thu, 2008-10-23 at 12:43 +0400, Andrey Mirkin wrote:
> > > > +#ifdef CONFIG_X86
> > > > + if (pmd_huge(*pmd)) {
> > > > + eprintk("page_huge\n");
> > > > + goto out_unsupported;
> > > > + }
> > > > +#endif
> > >
> > > I take it you know that this breaks with the 1GB (x86_64) and 16GB
> > > (ppc) large pages.
> > >
> > > Since you have the VMA, why not use is_vm_hugetlb_page()?
> >
> > Right now I'm checking VM_HUGETLB flag on VMAs in dump_one_vma().
> > This checks were added for sanity purpose just to throw out all
> > unsupported right now cases.
>
> I'm telling you that it's no good. Not only should this path never be
> hit. But, if it is, you'll crash anyway in some cases.
>
> It's a bad check. At best it misleads the reader to think that you've
> covered your bases.

Agree, I will rework this.

Andrey

2008-10-24 04:15:01

by Andrey Mirkin

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 05/10] Introduce function to dump process

On Monday 20 October 2008 15:02 Louis Rilling wrote:
> Hi,
>
> On Sat, Oct 18, 2008 at 03:11:33AM +0400, Andrey Mirkin wrote:
> > Functions to dump task struct, fpu state and registers are added.
> > All IDs are saved from the POV of process (container) namespace.
>
> Just a couple of little comments, in case this series should keep on
> living.
>
> [...]
>
> > diff --git a/checkpoint/cpt_process.c b/checkpoint/cpt_process.c
> > new file mode 100644
> > index 0000000..58f608d
> > --- /dev/null
> > +++ b/checkpoint/cpt_process.c
> > @@ -0,0 +1,236 @@
> > +/*
> > + * Copyright (C) 2008 Parallels, Inc.
> > + *
> > + * Author: Andrey Mirkin <[email protected]>
> > + *
> > + * This program is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU General Public License as
> > + * published by the Free Software Foundation, version 2 of the
> > + * License.
> > + *
> > + */
> > +
> > +#include <linux/sched.h>
> > +#include <linux/fs.h>
> > +#include <linux/file.h>
> > +#include <linux/version.h>
> > +#include <linux/nsproxy.h>
> > +
> > +#include "checkpoint.h"
> > +#include "cpt_image.h"
> > +
> > +static unsigned int encode_task_flags(unsigned int task_flags)
> > +{
> > + unsigned int flags = 0;
> > +
> > + if (task_flags & PF_EXITING)
> > + flags |= (1 << CPT_PF_EXITING);
> > + if (task_flags & PF_FORKNOEXEC)
> > + flags |= (1 << CPT_PF_FORKNOEXEC);
> > + if (task_flags & PF_SUPERPRIV)
> > + flags |= (1 << CPT_PF_SUPERPRIV);
> > + if (task_flags & PF_DUMPCORE)
> > + flags |= (1 << CPT_PF_DUMPCORE);
> > + if (task_flags & PF_SIGNALED)
> > + flags |= (1 << CPT_PF_SIGNALED);
> > + if (task_flags & PF_USED_MATH)
> > + flags |= (1 << CPT_PF_USED_MATH);
> > +
> > + return flags;
> > +
> > +}
> > +
> > +int cpt_dump_task_struct(struct task_struct *tsk, struct cpt_context *ctx)
> > +{
> > + struct cpt_task_image *t;
> > + int i;
> > + int err;
> > +
> > + t = kzalloc(sizeof(*t), GFP_KERNEL);
> > + if (!t)
> > + return -ENOMEM;
> > +
> > + t->cpt_len = sizeof(*t);
> > + t->cpt_type = CPT_OBJ_TASK;
> > + t->cpt_hdrlen = sizeof(*t);
> > + t->cpt_content = CPT_CONTENT_ARRAY;
> > +
> > + t->cpt_state = tsk->state;
> > + t->cpt_flags = encode_task_flags(tsk->flags);
> > + t->cpt_exit_code = tsk->exit_code;
> > + t->cpt_exit_signal = tsk->exit_signal;
> > + t->cpt_pdeath_signal = tsk->pdeath_signal;
> > + t->cpt_pid = task_pid_nr_ns(tsk, ctx->nsproxy->pid_ns);
> > + t->cpt_tgid = task_tgid_nr_ns(tsk, ctx->nsproxy->pid_ns);
> > + t->cpt_ppid = tsk->parent ?
> > + task_pid_nr_ns(tsk->parent, ctx->nsproxy->pid_ns) : 0;
> > + t->cpt_rppid = tsk->real_parent ?
> > + task_pid_nr_ns(tsk->real_parent, ctx->nsproxy->pid_ns) : 0;
> > + t->cpt_pgrp = task_pgrp_nr_ns(tsk, ctx->nsproxy->pid_ns);
> > + t->cpt_session = task_session_nr_ns(tsk, ctx->nsproxy->pid_ns);
> > + t->cpt_old_pgrp = 0;
> > + if (tsk->signal->tty_old_pgrp)
> > + t->cpt_old_pgrp = pid_vnr(tsk->signal->tty_old_pgrp);
> > + t->cpt_leader = tsk->group_leader ? task_pid_vnr(tsk->group_leader) : 0;
>
> Why pid_vnr() here, and task_*_nr_ns() above? According to the introducing
> comment, I'd expect something like pid_nr_ns(tsk->signal->tty_old_pgrp,
> tsk->nsproxy->pid_ns), and the same for tsk->group_leader.
>
> IIUC, pid_vnr() is correct only if ctx->nsproxy->pid_ns ==
> tsk->nsproxy->pid_ns == current->nsproxy->pid_ns, and I expect current to
> live in a different pid_ns.
>
> Comments?

Oh, you're right here. I have already fixed it in my tree, but accidentally the
wrong patch was sent.
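
For reference, the fix Louis suggests could look roughly like this (a sketch; it
assumes ctx->nsproxy->pid_ns is the container's pid namespace, as elsewhere in
the function):

	/* Sketch: resolve both IDs in the checkpointed pid namespace */
	t->cpt_old_pgrp = 0;
	if (tsk->signal->tty_old_pgrp)
		t->cpt_old_pgrp = pid_nr_ns(tsk->signal->tty_old_pgrp,
					    ctx->nsproxy->pid_ns);
	t->cpt_leader = tsk->group_leader ?
		task_pid_nr_ns(tsk->group_leader, ctx->nsproxy->pid_ns) : 0;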

>
> > + t->cpt_utime = tsk->utime;
> > + t->cpt_stime = tsk->stime;
> > + t->cpt_utimescaled = tsk->utimescaled;
> > + t->cpt_stimescaled = tsk->stimescaled;
> > + t->cpt_gtime = tsk->gtime;
> > + t->cpt_prev_utime = tsk->prev_utime;
> > + t->cpt_prev_stime = tsk->prev_stime;
> > + t->cpt_nvcsw = tsk->nvcsw;
> > + t->cpt_nivcsw = tsk->nivcsw;
> > + t->cpt_start_time = cpt_timespec_export(&tsk->start_time);
> > + t->cpt_real_start_time = cpt_timespec_export(&tsk->real_start_time);
> > + t->cpt_min_flt = tsk->min_flt;
> > + t->cpt_maj_flt = tsk->maj_flt;
> > + memcpy(t->cpt_comm, tsk->comm, TASK_COMM_LEN);
> > + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) {
> > + t->cpt_tls[i] = (((u64)tsk->thread.tls_array[i].b) << 32) +
> > + tsk->thread.tls_array[i].a;
> > + }
> > + /* TODO: encode thread flags and status like task flags */
> > + t->cpt_thrflags = task_thread_info(tsk)->flags & ~(1<<TIF_FREEZE);
> > + t->cpt_thrstatus = task_thread_info(tsk)->status;
> > + t->cpt_user = tsk->user->uid;
> > + t->cpt_uid = tsk->uid;
> > + t->cpt_euid = tsk->euid;
> > + t->cpt_suid = tsk->suid;
> > + t->cpt_fsuid = tsk->fsuid;
> > + t->cpt_gid = tsk->gid;
> > + t->cpt_egid = tsk->egid;
> > + t->cpt_sgid = tsk->sgid;
> > + t->cpt_fsgid = tsk->fsgid;
> > +
> > + err = ctx->write(t, sizeof(*t), ctx);
> > +
> > + kfree(t);
> > + return err;
> > +}
> > +
> > +static int cpt_dump_fpustate(struct task_struct *tsk, struct cpt_context *ctx)
> > +{
> > + struct cpt_obj_bits hdr;
> > + int err;
> > + int content;
> > + unsigned long size;
> > +
> > + content = CPT_CONTENT_X86_FPUSTATE;
> > + size = sizeof(struct i387_fxsave_struct);
> > +#ifndef CONFIG_X86_64
> > + if (!cpu_has_fxsr) {
> > + size = sizeof(struct i387_fsave_struct);
> > + content = CPT_CONTENT_X86_FPUSTATE_OLD;
> > + }
> > +#endif
> > +
> > + hdr.cpt_len = sizeof(hdr) + size;
> > + hdr.cpt_type = CPT_OBJ_BITS;
> > + hdr.cpt_hdrlen = sizeof(hdr);
> > + hdr.cpt_content = content;
> > + hdr.cpt_size = size;
> > + err = ctx->write(&hdr, sizeof(hdr), ctx);
> > + if (!err)
> > + ctx->write(tsk->thread.xstate, size, ctx);
>
> Should check the error code of the line above, right?
Right, thanks!

[...]

> > +int cpt_dump_task(struct task_struct *tsk, struct cpt_context *ctx)
> > +{
> > + int err;
> > +
> > + err = cpt_dump_task_struct(tsk, ctx);
> > +
> > + /* Dump task mm */
> > +
> > + if (!err)
> > + cpt_dump_fpustate(tsk, ctx);
>
> error checking...
>
> > + if (!err)
> > + cpt_dump_registers(tsk, ctx);
>
> error checking...
Thanks again, will fix it.
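
Something along these lines (a sketch of the error-checked chaining, using the
same helpers as in the patch):

	int cpt_dump_task(struct task_struct *tsk, struct cpt_context *ctx)
	{
		int err;

		err = cpt_dump_task_struct(tsk, ctx);

		/* Dump task mm */

		if (!err)
			err = cpt_dump_fpustate(tsk, ctx);
		if (!err)
			err = cpt_dump_registers(tsk, ctx);

		return err;
	}

	/* and in cpt_dump_fpustate(): */
	err = ctx->write(&hdr, sizeof(hdr), ctx);
	if (!err)
		err = ctx->write(tsk->thread.xstate, size, ctx);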


Andrey

2008-10-24 04:49:41

by Andrey Mirkin

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 05/10] Introduce function to dump process

On Monday 20 October 2008 21:48 Serge E. Hallyn wrote:
> Quoting Andrey Mirkin ([email protected]):
> > + t->cpt_uid = tsk->uid;
> > + t->cpt_euid = tsk->euid;
> > + t->cpt_suid = tsk->suid;
> > + t->cpt_fsuid = tsk->fsuid;
> > + t->cpt_gid = tsk->gid;
> > + t->cpt_egid = tsk->egid;
> > + t->cpt_sgid = tsk->sgid;
> > + t->cpt_fsgid = tsk->fsgid;
>
> I don't see where any of these are restored. (Obviously, I wanted
> to think about how you're verifying the restarter's authorization
> to do so)

Well, right now I don't use them during restore, to simplify the restart procedure
and make it clearer for reviewers. In OpenVZ we do the whole restart procedure
with root's privileges and rely on the fact that all such IDs will be the same
during restart (as we are restarting a container and its file system will be the
same during restart).
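
If/when these IDs are restored, the restart path will also need an authorization
check along these lines (a rough sketch against the pre-cred task fields this
patchset already uses; it is not part of the current code):

	/* Sketch: only privileged callers may restore arbitrary IDs */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;	/* or verify each saved ID against current's */

	tsk->uid  = t->cpt_uid;
	tsk->euid = t->cpt_euid;
	tsk->gid  = t->cpt_gid;
	tsk->egid = t->cpt_egid;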

Andrey

2008-10-25 21:12:18

by Oren Laadan

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 08/10] Introduce functions to restart a process



Andrey Mirkin wrote:
> On Thursday 23 October 2008 17:57 Dave Hansen wrote:
>> On Thu, 2008-10-23 at 13:00 +0400, Andrey Mirkin wrote:
>>>>>>> It is not related to the freezer code actually.
>>>>>>> That is needed to restart syscalls. Right now I don't have a code
>>>>>>> in my patchset which restarts a syscall, but later I plan to add
>>>>>>> it. In OpenVZ checkpointing we restart syscalls if process was
>>>>>>> caught in syscall during checkpointing.
>>>>>> Do you checkpoint uninterruptible syscalls as well? If only
>>>>>> interruptible syscalls are checkpointed, I'd say that either this
>>>>>> syscall uses ERESTARTSYS or ERESTART_RESTARTBLOCK, and then signal
>>>>>> handling code already does the trick, or this syscall does not
>>>>>> restart itself when interrupted, and well, this is life, userspace
>>>>>> just sees -EINTR, which is allowed by the syscall spec.
>>>>>> Actually this is how we checkpoint/migrate tasks in interruptible
>>>>>> syscalls in Kerrighed and this works.
>>>>> We checkpoint only interruptible syscalls. Some syscalls do not
>>>>> restart themself, that is why after restarting a process we restart
>>>>> syscall to complete it.
>>>> Can you please elaborate on this ? I don't recall having had issues
>>>> with that.
>>> Right now in 2.6.18 kernel we restarts in such a way pause,
>>> rt_sigtimedwait and futex syscalls. Recently futex syscall was reworked
>>> and we will not need such hooks for it.
>> Could you elaborate on this a bit?
>>
>> If the futex syscall was reworked, perhaps we can do the same for
>> rt_sigtimedwait() and get rid of this code completely.
>
> Well, we can try to rework rt_sigtimedwait(), but we will still need this code
> in the future to restart pause syscall from kernel without returning to user
> space. Also this code will be needed to restore some complex states.
> As concerns pause syscall I have already written to Louis about the problem we
> are trying to solve with this code. There is a gap when process will be in
> user space just before entering syscall again. At this time a signal can be
> delivered to process and it even can be handled. So, we will miss a signal
> which must interrupt pause syscall.

I'm not convinced that a real race exists, and even if it does, I'm not
convinced that hacking the assembly entry/exit code is the best way to do it.

Let me explain:

You are concerned about a race in which a signal is delivered to a task
that resumes from restart to user space and is about to (re)invoke 'pause()'
(because the restart so arranged its EIP and registers).

This almost always means that the user code is buggy and relies on specific
scheduling, because you can usually find a scheduling (without the C/R) where
the intended recipient of the signal was delayed and only calls pause() after
the signal is delivered.

For instance, if the sequence of events is:
A calls pause() -> checkpoint -> restart ->
B signals A -> A calls pause() (after restart),
then the following sequence is possible(*) without C/R:
B signals A -> A calls pause()
because normally B cannot assume anything about when A actually, really,
is suspended (which means the programmer did an imperfect job).

I said "almost always" and "usually", because there is one case where the
alternative schedule: task B could, prior to sending the signal, "ensure"
that task A is already sleeping within the 'pause()' syscall. While this
is possible, it is definitely unusual, and in fact I never code that does
that. And what if the sysadmin send SIGSTOP followed by SIGCONT ? In
short, such code is simply broken.

More importantly, if you think about the operation and semantics of the
freezer cgroup - similar behavior is to be expected when you freeze and
then thaw a container.

Specifically, when you freeze the container that has a task in sys_pause(),
then that task will abort the syscall and become frozen. As soon as it becomes
unfrozen, it will return to user space (with the EIP "rewinded") only to
re-invoke the syscall. So the same "race" remains even if you only freeze
and then thaw, regardless of C/R.

Moreover, I argue that basically when you return from a sys_restart(), the
entire container should, by default, remain in frozen state - just like it
is with sys_checkpoint(). An explicit thaw will make the container resume
execution.

Therefore, there are two options: the first is to decide that this behavior
- going back to user space to re-invoke the syscall - is valid. In this case
you don't need a special hack for returning from sys_restart(). The second
option is to decide that it is broken, in which case you need to also fix
the freezer code. Personally, I think that this behavior is valid and need
not be fixed.

Finally, even if you do want to fix the behavior for this pathological case,
I don't see why you'd want to do it in this manner. Instead, you can add a
simple test prior to returning from sys_restart(), something like this:

...
/* almost done: now handle special cases: */
if (our last syscall == __NR_pause) {
ret = sys_pause();
} else if (our last syscall == __NR_futex) {
do some stuff;
ret = sys_futex();
} else {
ret = what-we-want-to-return
}
/* finally, return to user space */
return ret;
}
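
To make that concrete, a rough C sketch of the same dispatch (ctx->last_syscall,
the futex argument names, and what_we_want_to_return are placeholders, not
anything in the patchset):

	/* Sketch only: dispatch on the syscall the task was blocked in */
	switch (ctx->last_syscall) {
	case __NR_pause:
		ret = sys_pause();
		break;
	case __NR_futex:
		/* re-derive uaddr/op/val etc. from the saved registers */
		ret = sys_futex(uaddr, op, val, timeout, uaddr2, val3);
		break;
	default:
		ret = what_we_want_to_return;
		break;
	}
	/* finally, return to user space */
	return ret;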

I'm not quite sure what other "complex states" you refer to; but I wonder
whether that code "needed to restore some complex states" could not be
implemented along the same idea.

The upside is clear: the code is less obscure, simple to debug, and not
architecture-dependent. (hehe .. it even runs faster because it saves a
whole kernel->user->kernel switch, what do you know !).

Oren.

2008-10-27 15:58:38

by Oren Laadan

[permalink] [raw]
Subject: Re: [PATCH 10/10] Add support for multiple processes



Andrey Mirkin wrote:
> The whole tree of processes can be checkpointed and restarted now.
> Shared objects are not supported yet.
>
> Signed-off-by: Andrey Mirkin <[email protected]>
> ---
> checkpoint/cpt_image.h | 2 +
> checkpoint/cpt_process.c | 24 +++++++++++++
> checkpoint/rst_process.c | 85 +++++++++++++++++++++++++++-------------------
> 3 files changed, 76 insertions(+), 35 deletions(-)
>
> diff --git a/checkpoint/cpt_image.h b/checkpoint/cpt_image.h
> index e1fb483..f370df2 100644
> --- a/checkpoint/cpt_image.h
> +++ b/checkpoint/cpt_image.h
> @@ -128,6 +128,8 @@ struct cpt_task_image {
> __u64 cpt_nivcsw;
> __u64 cpt_min_flt;
> __u64 cpt_maj_flt;
> + __u32 cpt_children_num;
> + __u32 cpt_pad;
> } __attribute__ ((aligned (8)));
>
> struct cpt_mm_image {
> diff --git a/checkpoint/cpt_process.c b/checkpoint/cpt_process.c
> index 1f7a54b..d73ec3c 100644
> --- a/checkpoint/cpt_process.c
> +++ b/checkpoint/cpt_process.c
> @@ -40,6 +40,19 @@ static unsigned int encode_task_flags(unsigned int task_flags)
>
> }
>
> +static int cpt_count_children(struct task_struct *tsk, struct cpt_context *ctx)
> +{
> + int num = 0;
> + struct task_struct *child;
> +
> + list_for_each_entry(child, &tsk->children, sibling) {
> + if (child->parent != tsk)
> + continue;
> + num++;
> + }
> + return num;
> +}
> +

I noticed that you don't take the appropriate locks when browsing through
the task lists (siblings, children, global list). Although I realize that
the container should be frozen at this time, I keep wondering if this
is indeed always safe.

For instance, are you protected against an OOM killer that might just
occur uninvited and kill one of those tasks ?

Can the administrator force an un-freeze of the container ? Or could perhaps
some error condition in the kernel cause that ?

> int cpt_dump_task_struct(struct task_struct *tsk, struct cpt_context *ctx)
> {
> struct cpt_task_image *t;
> @@ -102,6 +115,7 @@ int cpt_dump_task_struct(struct task_struct *tsk, struct cpt_context *ctx)
> t->cpt_egid = tsk->egid;
> t->cpt_sgid = tsk->sgid;
> t->cpt_fsgid = tsk->fsgid;
> + t->cpt_children_num = cpt_count_children(tsk, ctx);
>
> err = ctx->write(t, sizeof(*t), ctx);
>
> @@ -231,6 +245,16 @@ int cpt_dump_task(struct task_struct *tsk, struct cpt_context *ctx)
> err = cpt_dump_fpustate(tsk, ctx);
> if (!err)
> err = cpt_dump_registers(tsk, ctx);
> + if (!err) {
> + struct task_struct *child;
> + list_for_each_entry(child, &tsk->children, sibling) {
> + if (child->parent != tsk)
> + continue;
> + err = cpt_dump_task(child, ctx);
> + if (err)
> + break;

Here too.

[...]

Oren.

2008-10-29 14:54:44

by Andrey Mirkin

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 08/10] Introduce functions to restart a process

On Sunday 26 October 2008 01:10 Oren Laadan wrote:
> Andrey Mirkin wrote:
> > On Thursday 23 October 2008 17:57 Dave Hansen wrote:
> >> On Thu, 2008-10-23 at 13:00 +0400, Andrey Mirkin wrote:
> >>>>>>> It is not related to the freezer code actually.
> >>>>>>> That is needed to restart syscalls. Right now I don't have a code
> >>>>>>> in my patchset which restarts a syscall, but later I plan to add
> >>>>>>> it. In OpenVZ checkpointing we restart syscalls if process was
> >>>>>>> caught in syscall during checkpointing.
> >>>>>>
> >>>>>> Do you checkpoint uninterruptible syscalls as well? If only
> >>>>>> interruptible syscalls are checkpointed, I'd say that either this
> >>>>>> syscall uses ERESTARTSYS or ERESTART_RESTARTBLOCK, and then signal
> >>>>>> handling code already does the trick, or this syscall does not
> >>>>>> restart itself when interrupted, and well, this is life, userspace
> >>>>>> just sees -EINTR, which is allowed by the syscall spec.
> >>>>>> Actually this is how we checkpoint/migrate tasks in interruptible
> >>>>>> syscalls in Kerrighed and this works.
> >>>>>
> >>>>> We checkpoint only interruptible syscalls. Some syscalls do not
> >>>>> restart themself, that is why after restarting a process we restart
> >>>>> syscall to complete it.
> >>>>
> >>>> Can you please elaborate on this ? I don't recall having had issues
> >>>> with that.
> >>>
> >>> Right now in 2.6.18 kernel we restarts in such a way pause,
> >>> rt_sigtimedwait and futex syscalls. Recently futex syscall was reworked
> >>> and we will not need such hooks for it.
> >>
> >> Could you elaborate on this a bit?
> >>
> >> If the futex syscall was reworked, perhaps we can do the same for
> >> rt_sigtimedwait() and get rid of this code completely.
> >
> > Well, we can try to rework rt_sigtimedwait(), but we will still need this
> > code in the future to restart pause syscall from kernel without returning
> > to user space. Also this code will be needed to restore some complex
> > states. As concerns pause syscall I have already written to Louis about
> > the problem we are trying to solve with this code. There is a gap when
> > process will be in user space just before entering syscall again. At this
> > time a signal can be delivered to process and it even can be handled. So,
> > we will miss a signal which must interrupt pause syscall.
>
> I'm not convinced that a real race exists, and even if it does, I'm not
> convinced that hacking the assembly entry/exit code is the best way to do
> it.

Well, as I already said, the pause() syscall is not the only case where we need
to do some additional work in that place.

> Let me explain:
>
> You are concerned about a race in which a signal is delivered to a task
> that resumes from restart to user space and is about to (re)invoke
> 'pause()' (because the restart so arranged its EIP and registers).
>
> This almost always means that the user code is buggy and relies on specific
> scheduling, because you can usually find a scheduling (without the C/R)
> where the intended recipient of the signal was delayed and only calls
> pause() after the signal is delivered.
>
> For instance, if the sequence of events is:
> A calls pause() -> checkpoint -> restart ->
> B signals A -> A calls pause() (after restart),
> then the following sequence is possible(*) without C/R:
> B signals A -> A calls pause()
> because normally B cannot assume anything about when A is actually,
> really, is suspended (which means the programmer did an imperfect job).

You're right here. Both sequences are possible in theory. You will be surprised,
but in practice we found out that the probability of missing a signal in the case
of C/R is much higher than during ordinary execution.

> I said "almost always" and "usually", because there is one case where the
> alternative schedule: task B could, prior to sending the signal, "ensure"
> that task A is already sleeping within the 'pause()' syscall. While this
> is possible, it is definitely unusual, and in fact I never code that does
> that. And what if the sysadmin send SIGSTOP followed by SIGCONT ? In
> short, such code is simply broken.
>
> More importantly, if you think about the operation and semantics of the
> freezer cgroup - similar behavior is to be expected when you freeze and
> then thaw a container.
>
> Specifically, when you freeze the container that has a task in sys_pause(),
> then that task will abort the syscall and become frozen. As soon as it becomes
> unfrozen, it will return to user space (with the EIP "rewinded") only to
> re-invoke the syscall. So the same "race" remains even if you only freeze
> and then thaw, regardless of C/R.

Exactly. But during freeze/unfreeze the probability of hitting such a situation
is very low. In our tests we tried to checkpoint/restart the LTP tests, and this
"race" was triggered during restart in almost 100% of the tests.

> Moreover, I argue that basically when you return from a sys_restart(), the
> entire container should, by default, remain in frozen state - just like it
> is with sys_checkpoint(). An explicit thaw will make the container resume
> execution.

No doubt. That is how we do it in OpenVZ: after restart the container remains
frozen, and we need to thaw it to resume its execution.

> Therefore, there are two options: the first is to decide that this behavior
> - going back to user space to re-invoke the syscall - is valid. In this
> case you don't need a special hack for returning from sys_restart(). The
> second option is to decide that it is broken, in which case you need to
> also fix the freezer code. Personally, I think that this behavior is valid
> and need not be fixed.

I still believe that we need to fix such behaviour during restart, as in
practice it is very easy to trigger.

> Finally, even if you do want to fix the behavior for this pathologic case,
> I don't see why you'd want to do it in this manner. Instead, you can add a
> simple test prior to returning from sys_restart(), something like this:
>
> ...
> /* almost done: now handle special cases: */
> if (our last syscall == __NR_pause) {
> ret = sys_pause();
> } else if (our last syscall == __NR_futex) {
> do some stuff;
> ret = sys_futex();
> } else {
> ret = what-we-want-to-return
> }
> /* finally, return to user space */
> return ret;
> }

This only works if we do not want to stay in the frozen state after restart. Or
am I missing something?

> I'm not quite sure what other "complex states" you refer to; but I wonder
> whether that code "needed to restore some complex states" could not be
> implemented along the same idea.

In the same manner we restore, for instance, ptrace.

> The upside is clear: the code is less obscure, simple to debug, and not
> architecture-dependent. (hehe .. it even runs faster because it saves a
> whole kernel->user->kernel switch, what do you know !).
In our case we also do not need a switch, and the code is actually not very
complicated.

Andrey

2008-10-29 15:30:13

by Andrey Mirkin

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 03/10] Introduce context structure needed during checkpointing/restart

On Monday 20 October 2008 21:02 Dave Hansen wrote:
> On Sat, 2008-10-18 at 03:11 +0400, Andrey Mirkin wrote:
> > +typedef struct cpt_context
> > +{
> > + pid_t pid; /* should be changed to ctid later */
> > + int ctx_id; /* context id */
> > + struct list_head ctx_list;
> > + int refcount;
> > + int ctx_state;
> > + struct semaphore main_sem;
>
> Does this really need to be a semaphore or is a mutex OK?
Actually a mutex is enough here.
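
Something like this (a sketch of the switch to struct mutex; everything else in
the structure stays as it is):

	#include <linux/mutex.h>

	typedef struct cpt_context {
		pid_t pid;		/* should be changed to ctid later */
		int ctx_id;		/* context id */
		struct list_head ctx_list;
		int refcount;
		int ctx_state;
		struct mutex main_mutex;	/* was: struct semaphore main_sem */
		/* remaining fields unchanged */
	} cpt_context_t;

	/* and in context_alloc(): */
	mutex_init(&ctx->main_mutex);		/* was: init_MUTEX(&ctx->main_sem) */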

> > + int errno;
>
> Could you hold off on adding these things to the struct until the patch
> where they're actually used? It's hard to judge this without seeing
> what you do with it.
I will try not to introduce variables and functions before the patches where
they are actually used.

>
> > + struct file *file;
> > + loff_t current_object;
> > +
> > + struct list_head object_array[CPT_OBJ_MAX];
> > +
> > + int (*write)(const void *addr, size_t count, struct cpt_context *ctx);
> > + int (*read)(void *addr, size_t count, struct cpt_context *ctx);
> > +} cpt_context_t;
>
> Man, this is hard to review. I was going to try and make sure that your
> refcounting was right and atomic, but there's no use of it in this patch
> except for the initialization and accessor functions. Darn.
For simplicity I will throw out all this stuff completely.

>
> > +extern int debug_level;
>
> I'm going to go out on a limb here and say that "debug_level" is
> probably a wee bit too generic of a variable name.
I will change it to something else.

>
> > +#define cpt_printk(lvl, fmt, args...) do { \
> > + if (lvl <= debug_level) \
> > + printk(fmt, ##args); \
> > + } while (0)
>
> I think you can use pr_debug() here, too, just like Oren did.
Will switch to pr_debug().
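
A sketch of that change (pr_debug() only prints when DEBUG is defined, so the
debug_level knob goes away; the "cpt: " prefix is just illustrative):

	#define cpt_debug(fmt, args...)	pr_debug("cpt: " fmt, ##args)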

>
> > +struct cpt_context * context_alloc(void)
> > +{
> > + struct cpt_context *ctx;
> > + int i;
> > +
> > + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
> > + if (!ctx)
> > + return NULL;
> > +
> > + init_MUTEX(&ctx->main_sem);
> > + ctx->refcount = 1;
> > +
> > + ctx->current_object = -1;
> > + ctx->write = file_write;
> > + ctx->read = file_read;
> > + for (i = 0; i < CPT_OBJ_MAX; i++) {
> > + INIT_LIST_HEAD(&ctx->object_array[i]);
> > + }
> > +
> > + return ctx;
> > +}
> > +
> > +void context_release(struct cpt_context *ctx)
> > +{
> > + ctx->ctx_state = CPT_CTX_ERROR;
> > +
> > + kfree(ctx);
> > +}
> > +
> > +static void context_put(struct cpt_context *ctx)
> > +{
> > + if (!--ctx->refcount)
> > + context_release(ctx);
> > +}
> > +
> > static int checkpoint(pid_t pid, int fd, unsigned long flags)
> > {
> > - return -ENOSYS;
> > + struct file *file;
> > + struct cpt_context *ctx;
> > + int err;
> > +
> > + err = -EBADF;
> > + file = fget(fd);
> > + if (!file)
> > + goto out;
> > +
> > + err = -ENOMEM;
> > + ctx = context_alloc();
> > + if (!ctx)
> > + goto out_file;
> > +
> > + ctx->file = file;
> > + ctx->ctx_state = CPT_CTX_DUMPING;
> > +
> > + /* checkpoint */
> > + err = -ENOSYS;
> > +
> > + context_put(ctx);
> > +
> > +out_file:
> > + fput(file);
> > +out:
> > + return err;
> > }
>
> So, where is context_get()? Is there only single-threaded access to the
> refcount? If so, why do we even need it? We should probably just use
> context_release() directly.
The idea is that in the future we should be able to keep a context around for
incremental checkpointing. That is why we need context get/put functions. Right
now they are not used, so I will drop them.

> If there is multithreaded access to context_put() or the refcount, then
> they're unsafe without additional locking.
Access to refcount will be protected with context mutex.
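
As an aside, a kref would make get/put safe without having to take the context
mutex at all (a sketch, not what the current patch does):

	#include <linux/kref.h>

	struct cpt_context {
		struct kref ref;
		/* other fields as before */
	};

	static void context_release(struct kref *kref)
	{
		struct cpt_context *ctx = container_of(kref, struct cpt_context, ref);

		kfree(ctx);
	}

	static inline void context_get(struct cpt_context *ctx)
	{
		kref_get(&ctx->ref);
	}

	static inline void context_put(struct cpt_context *ctx)
	{
		kref_put(&ctx->ref, context_release);
	}

	/* in context_alloc(): kref_init(&ctx->ref); */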

Thanks for comments.

Actually I'm not sure if I will continue with my own patch set, but I will
take all your comments into account while porting my functionality to Oren's
tree.

Andrey

2008-10-30 04:54:44

by Andrey Mirkin

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 10/10] Add support for multiple processes

On Monday 27 October 2008 18:58 Oren Laadan wrote:
> Andrey Mirkin wrote:
> > The whole tree of processes can be checkpointed and restarted now.
> > Shared objects are not supported yet.
> >
> > Signed-off-by: Andrey Mirkin <[email protected]>
> > ---
> > checkpoint/cpt_image.h | 2 +
> > checkpoint/cpt_process.c | 24 +++++++++++++
> > checkpoint/rst_process.c | 85 +++++++++++++++++++++++++++-------------------
> > 3 files changed, 76 insertions(+), 35 deletions(-)
> >
> > diff --git a/checkpoint/cpt_image.h b/checkpoint/cpt_image.h
> > index e1fb483..f370df2 100644
> > --- a/checkpoint/cpt_image.h
> > +++ b/checkpoint/cpt_image.h
> > @@ -128,6 +128,8 @@ struct cpt_task_image {
> > __u64 cpt_nivcsw;
> > __u64 cpt_min_flt;
> > __u64 cpt_maj_flt;
> > + __u32 cpt_children_num;
> > + __u32 cpt_pad;
> > } __attribute__ ((aligned (8)));
> >
> > struct cpt_mm_image {
> > diff --git a/checkpoint/cpt_process.c b/checkpoint/cpt_process.c
> > index 1f7a54b..d73ec3c 100644
> > --- a/checkpoint/cpt_process.c
> > +++ b/checkpoint/cpt_process.c
> > @@ -40,6 +40,19 @@ static unsigned int encode_task_flags(unsigned int task_flags)
> >
> > }
> >
> > +static int cpt_count_children(struct task_struct *tsk, struct cpt_context *ctx)
> > +{
> > + int num = 0;
> > + struct task_struct *child;
> > +
> > + list_for_each_entry(child, &tsk->children, sibling) {
> > + if (child->parent != tsk)
> > + continue;
> > + num++;
> > + }
> > + return num;
> > +}
> > +
>
> I noticed that you don't take the appropriate locks when browsing through
> the task lists (siblings, children, global list). Although I realize that
> the container should be frozen at this time, I keep wondering if this
> is indeed always safe.
You're right here. We need to take tasklist_lock to be sure that everything
will be consistent.

> For instance, are you protected against an OOM killer that might just
> occur uninvited and kill one of those tasks ?

The OOM killer can't kill one of those tasks, as all of them should be frozen
and in uninterruptible state at that time. So we do not need to think about OOM
there. But anyway you're right, and we need locking around browsing the tasks
list.
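
Something like this (a sketch of the same loop under tasklist_lock):

	static int cpt_count_children(struct task_struct *tsk, struct cpt_context *ctx)
	{
		int num = 0;
		struct task_struct *child;

		read_lock(&tasklist_lock);
		list_for_each_entry(child, &tsk->children, sibling) {
			if (child->parent != tsk)
				continue;
			num++;
		}
		read_unlock(&tasklist_lock);

		return num;
	}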

> Can the administrator force an un-freeze of the container ? Or could perhaps
> some error condition in the kernel cause that ?

The main idea is that the context should be protected with a mutex, so that only
one process at a time can perform an operation (freeze, dump, unfreeze, kill) on
a container. Right now this is not implemented at all, but it will be added in
the future.

Andrey

> > int cpt_dump_task_struct(struct task_struct *tsk, struct cpt_context *ctx)
> > {
> > struct cpt_task_image *t;
> > @@ -102,6 +115,7 @@ int cpt_dump_task_struct(struct task_struct *tsk, struct cpt_context *ctx)
> > t->cpt_egid = tsk->egid;
> > t->cpt_sgid = tsk->sgid;
> > t->cpt_fsgid = tsk->fsgid;
> > + t->cpt_children_num = cpt_count_children(tsk, ctx);
> >
> > err = ctx->write(t, sizeof(*t), ctx);
> >
> > @@ -231,6 +245,16 @@ int cpt_dump_task(struct task_struct *tsk, struct cpt_context *ctx)
> > err = cpt_dump_fpustate(tsk, ctx);
> > if (!err)
> > err = cpt_dump_registers(tsk, ctx);
> > + if (!err) {
> > + struct task_struct *child;
> > + list_for_each_entry(child, &tsk->children, sibling) {
> > + if (child->parent != tsk)
> > + continue;
> > + err = cpt_dump_task(child, ctx);
> > + if (err)
> > + break;
>
> Here too.
>
> [...]
>
> Oren.

2008-10-30 16:00:47

by Oren Laadan

[permalink] [raw]
Subject: Re: [Devel] Re: [PATCH 08/10] Introduce functions to restart a process



Andrey Mirkin wrote:
> On Sunday 26 October 2008 01:10 Oren Laadan wrote:
>> Andrey Mirkin wrote:
>>> On Thursday 23 October 2008 17:57 Dave Hansen wrote:
>>>> On Thu, 2008-10-23 at 13:00 +0400, Andrey Mirkin wrote:
>>>>>>>>> It is not related to the freezer code actually.
>>>>>>>>> That is needed to restart syscalls. Right now I don't have a code
>>>>>>>>> in my patchset which restarts a syscall, but later I plan to add
>>>>>>>>> it. In OpenVZ checkpointing we restart syscalls if process was
>>>>>>>>> caught in syscall during checkpointing.
>>>>>>>> Do you checkpoint uninterruptible syscalls as well? If only
>>>>>>>> interruptible syscalls are checkpointed, I'd say that either this
>>>>>>>> syscall uses ERESTARTSYS or ERESTART_RESTARTBLOCK, and then signal
>>>>>>>> handling code already does the trick, or this syscall does not
>>>>>>>> restart itself when interrupted, and well, this is life, userspace
>>>>>>>> just sees -EINTR, which is allowed by the syscall spec.
>>>>>>>> Actually this is how we checkpoint/migrate tasks in interruptible
>>>>>>>> syscalls in Kerrighed and this works.
>>>>>>> We checkpoint only interruptible syscalls. Some syscalls do not
>>>>>>> restart themself, that is why after restarting a process we restart
>>>>>>> syscall to complete it.
>>>>>> Can you please elaborate on this ? I don't recall having had issues
>>>>>> with that.
>>>>> Right now in 2.6.18 kernel we restarts in such a way pause,
>>>>> rt_sigtimedwait and futex syscalls. Recently futex syscall was reworked
>>>>> and we will not need such hooks for it.
>>>> Could you elaborate on this a bit?
>>>>
>>>> If the futex syscall was reworked, perhaps we can do the same for
>>>> rt_sigtimedwait() and get rid of this code completely.
>>> Well, we can try to rework rt_sigtimedwait(), but we will still need this
>>> code in the future to restart pause syscall from kernel without returning
>>> to user space. Also this code will be needed to restore some complex
>>> states. As concerns pause syscall I have already written to Louis about
>>> the problem we are trying to solve with this code. There is a gap when
>>> process will be in user space just before entering syscall again. At this
>>> time a signal can be delivered to process and it even can be handled. So,
>>> we will miss a signal which must interrupt pause syscall.
>> I'm not convinced that a real race exists, and even if it does, I'm not
>> convinced that hacking the assembly entry/exit code is the best way to do
>> it.
>
> Well, as I already said, the pause() syscall is not the only case where we need
> to do some additional work in that place.
>
>> Let me explain:
>>
>> You are concerned about a race in which a signal is delivered to a task
>> that resumes from restart to user space and is about to (re)invoke
>> 'pause()' (because the restart so arranged its EIP and registers).
>>
>> This almost always means that the user code is buggy and relies on specific
>> scheduling, because you can usually find a scheduling (without the C/R)
>> where the intended recipient of the signal was delayed and only calls
>> pause() after the signal is delivered.
>>
>> For instance, if the sequence of events is:
>> A calls pause() -> checkpoint -> restart ->
>> B signals A -> A calls pause() (after restart),
>> then the following sequence is possible(*) without C/R:
>> B signals A -> A calls pause()
>> because normally B cannot assume anything about when A is actually,
>> really, is suspended (which means the programmer did an imperfect job).
>
> You're right here. Both sequences are possible in theory. You will be surprised,
> but in practice we found out that the probability of missing a signal in the case
> of C/R is much higher than during ordinary execution.

The point is that "missing" a signal because of freeze/thaw (or stop/cont)
is perfectly acceptable behavior. It is supposed to work that way. So I
argue that we don't need a workaround.

>
>> I said "almost always" and "usually", because there is one case where the
>> alternative schedule: task B could, prior to sending the signal, "ensure"
>> that task A is already sleeping within the 'pause()' syscall. While this
>> is possible, it is definitely unusual, and in fact I never code that does
>> that. And what if the sysadmin send SIGSTOP followed by SIGCONT ? In
>> short, such code is simply broken.
>>
>> More importantly, if you think about the operation and semantics of the
>> freezer cgroup - similar behavior is to be expected when you freeze and
>> then thaw a container.
>>
>> Specifically, when you freeze the container that has a task in sys_pause(),
>> then that task will abort the syscall and become frozen. As soon as it becomes
>> unfrozen, it will return to user space (with the EIP "rewinded") only to
>> re-invoke the syscall. So the same "race" remains even if you only freeze
>> and then thaw, regardless of C/R.
>
> Exactly. But during freeze/unfreeze the probability of hitting such a situation
> is very low. In our tests we tried to checkpoint/restart the LTP tests, and this
> "race" was triggered during restart in almost 100% of the tests.

Why is that the case ?

At the end of restart, the container remains frozen until you unfreeze it.
So c/r effectively becomes freeze/unfreeze, except - possibly - for page
faults that are more likely to happen when thawing following the restart
(and these may slow down the application and allow a signal to "slip in").

If it's nearly 100% of the tests, then it should be easily reproducible
with mere freeze/thaw pairs, no ?

Still, arguing that LTP "breaks" here is like arguing that LTP "breaks"
if we were to run it while sending SIGSTOP/SIGCONT to its processes...

>
>> Moreover, I argue that basically when you return from a sys_restart(), the
>> entire container should, by default, remain in frozen state - just like it
>> is with sys_checkpoint(). An explicit thaw will make the container resume
>> execution.
>
> No doubt. That is how we do in OpenVZ, after restart the container remains
> frozen. And we need to thaw it to resume its execution.
>
>> Therefore, there are two options: the first is to decide that this behavior
>> - going back to user space to re-invoke the syscall - is valid. In this
>> case you don't need a special hack for returning from sys_restart(). The
>> second option is to decide that it is broken, in which case you need to
>> also fix the freezer code. Personally, I think that this behavior is valid
>> and need not be fixed.
>
> I still believe that we need to fix such behaviour during restart as in
> practice it is very easy to trigger it.

Did you see any problem outside LTP ? I never saw any such problems.

>
>> Finally, even if you do want to fix the behavior for this pathologic case,
>> I don't see why you'd want to do it in this manner. Instead, you can add a
>> simple test prior to returning from sys_restart(), something like this:
>>
>> ...
>> /* almost done: now handle special cases: */
>> if (our last syscall == __NR_pause) {
do_freeze();

>> ret = sys_pause();
>> } else if (our last syscall == __NR_futex) {
>> do some stuff;
do_freeze();

>> ret = sys_futex();
>> } else {
>> ret = what-we-want-to-return
>> }
>> /* finally, return to user space */
>> return ret;
>> }
>
> This only works if we do not want to stay in the frozen state after restart. Or
> am I missing something?

Sure: see addition above.

>
>> I'm not quite sure what other "complex states" you refer to; but I wonder
>> whether that code "needed to restore some complex states" could not be
>> implemented along the same idea.
>
> In the same manner we are restoring for instance ptrace.

Yes, I mentioned that in the past. Can be addressed in the same manner.

>
>> The upside is clear: the code is less obscure, simple to debug, and not
>> architecture-dependent. (hehe .. it even runs faster because it saves a
>> whole kernel->user->kernel switch, what do you know !).

> In our case we also do not need a switch, and the code is actually not very
> complicated.

Fact is that people wondered what was going on there.

I prefer the arch-independent way, because it is, well, arch-independent,
and because it makes the logic and the exception obvious to the reader,
and it is easily extensible to handle additional special cases (like
ptrace), and makes maintenance easier.

Do you object to doing it this way ?

Oren.