Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1755885AbYHTT1Z (ORCPT ); Wed, 20 Aug 2008 15:27:25 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1755044AbYHTT0L (ORCPT ); Wed, 20 Aug 2008 15:26:11 -0400 Received: from e2.ny.us.ibm.com ([32.97.182.142]:48278 "EHLO e2.ny.us.ibm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754599AbYHTT0I (ORCPT ); Wed, 20 Aug 2008 15:26:08 -0400 Subject: [RFC v2][PATCH 4/9] checkpoint/restart: memory management To: arnd@arndb.de Cc: orenl@cs.columbia.edu, jeremy@goop.org, containers@lists.linux-foundation.org, linux-kernel@vger.kernel.org, Dave Hansen From: Dave Hansen Date: Wed, 20 Aug 2008 12:26:02 -0700 References: <20080820192557.98788FAB@nimitz> In-Reply-To: <20080820192557.98788FAB@nimitz> Message-Id: <20080820192602.F5C7CCAD@nimitz> Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 28693 Lines: 1052 For each vma, there is a 'struct cr_hdr_vma'; if the vma is file-mapped, it will be followed by the file name. Its npages field tells how many pages were dumped for this vma. Then it will be followed by the actual data: first a dump of the addresses of all dumped pages (npages entries) followed by a dump of the contents of all dumped pages (npages pages). Then will come the next vma and so on. I guess I could also separate out the x86-specific bits here, but they're pretty small, comparatively. 
Signed-off-by: Oren Laadan --- oren-cr.git-dave/arch/x86/kernel/ldt.c | 2 oren-cr.git-dave/checkpoint/Makefile | 2 oren-cr.git-dave/checkpoint/ckpt_arch.h | 2 oren-cr.git-dave/checkpoint/ckpt_hdr.h | 21 + oren-cr.git-dave/checkpoint/ckpt_mem.c | 392 ++++++++++++++++++++++++++++++++ oren-cr.git-dave/checkpoint/ckpt_mem.h | 32 ++ oren-cr.git-dave/checkpoint/rstr_mem.c | 354 ++++++++++++++++++++++++++++ oren-cr.git-dave/checkpoint/sys.c | 13 - oren-cr.git-dave/checkpoint/x86.c | 83 ++++++ oren-cr.git-dave/include/asm-x86/ckpt.h | 5 oren-cr.git-dave/include/asm-x86/desc.h | 3 11 files changed, 895 insertions(+), 14 deletions(-) diff -puN arch/x86/kernel/ldt.c~0005-checkpoint-restart-memory-management arch/x86/kernel/ldt.c --- oren-cr.git/arch/x86/kernel/ldt.c~0005-checkpoint-restart-memory-management 2008-08-20 12:12:49.000000000 -0700 +++ oren-cr.git-dave/arch/x86/kernel/ldt.c 2008-08-20 12:12:49.000000000 -0700 @@ -181,7 +181,7 @@ static int read_default_ldt(void __user return bytecount; } -static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) +int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) { struct mm_struct *mm = current->mm; struct desc_struct ldt; diff -puN checkpoint/ckpt_arch.h~0005-checkpoint-restart-memory-management checkpoint/ckpt_arch.h --- oren-cr.git/checkpoint/ckpt_arch.h~0005-checkpoint-restart-memory-management 2008-08-20 12:12:49.000000000 -0700 +++ oren-cr.git-dave/checkpoint/ckpt_arch.h 2008-08-20 12:12:49.000000000 -0700 @@ -4,3 +4,5 @@ int cr_write_thread(struct cr_ctx *ctx, int cr_write_cpu(struct cr_ctx *ctx, struct task_struct *t); int cr_read_thread(struct cr_ctx *ctx); int cr_read_cpu(struct cr_ctx *ctx); +int cr_write_mm_context(struct cr_ctx *ctx, struct mm_struct *mm); +int cr_read_mm_context(struct cr_ctx *ctx, struct mm_struct *mm); diff -puN checkpoint/ckpt_hdr.h~0005-checkpoint-restart-memory-management checkpoint/ckpt_hdr.h --- 
oren-cr.git/checkpoint/ckpt_hdr.h~0005-checkpoint-restart-memory-management	2008-08-20 12:12:49.000000000 -0700
+++ oren-cr.git-dave/checkpoint/ckpt_hdr.h	2008-08-20 12:12:49.000000000 -0700
@@ -67,3 +67,24 @@ struct cr_hdr_task {
 
 };
 
+
+struct cr_hdr_mm {
+	__u32 tag;	/* sharing identifier */
+	__u64 start_code, end_code, start_data, end_data;
+	__u64 start_brk, brk, start_stack;
+	__u64 arg_start, arg_end, env_start, env_end;
+	__s32 map_count;	/* vma records that follow; 16 bits can wrap */
+};
+
+struct cr_hdr_vma {
+	__u32 how;	/* CR_VMA_ANON or CR_VMA_FILE */
+
+	__u64 vm_start;
+	__u64 vm_end;
+	__u64 vm_page_prot;
+	__u64 vm_flags;
+	__u64 vm_pgoff;
+
+	__s32 npages;	/* pages dumped; >32767 pages would wrap in 16 bits */
+	__s16 namelen;	/* length of the file name that follows, 0 if none */
+};
diff -puN /dev/null checkpoint/ckpt_mem.c
--- /dev/null	2008-04-22 10:49:52.000000000 -0700
+++ oren-cr.git-dave/checkpoint/ckpt_mem.c	2008-08-20 12:12:49.000000000 -0700
@@ -0,0 +1,392 @@
+/*
+ * Checkpoint memory contents
+ *
+ * Copyright (C) 2008 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include "ckpt.h"
+#include "ckpt_hdr.h"
+#include "ckpt_arch.h"
+#include "ckpt_mem.h"
+
+/*
+ * utilities to alloc, free, and handle 'struct cr_pgarr'
+ * (common to ckpt_mem.c and rstr_mem.c)
+ */
+
+#define CR_ORDER_PGARR 0
+#define CR_PGARR_TOTAL ((PAGE_SIZE << CR_ORDER_PGARR) / sizeof(void *))
+
+/* release pages referenced by a page-array */
+void _cr_pgarr_release(struct cr_ctx *ctx, struct cr_pgarr *pgarr)
+{
+	int n;
+
+	/* only checkpoint keeps references to pages */
+	if (ctx->flags & CR_CTX_CKPT) {
+		pr_debug("release pages (nused %d)\n", pgarr->nused);
+		for (n = pgarr->nused; n--; )
+			page_cache_release(pgarr->pages[n]);
+	}
+	pgarr->nused = 0;
+	pgarr->nleft = CR_PGARR_TOTAL;
+}
+
+/* release pages referenced by chain of page-arrays */
+void cr_pgarr_release(struct cr_ctx *ctx)
+{
+	struct cr_pgarr *pgarr;
+
+	for (pgarr = ctx->pgarr; pgarr; pgarr = pgarr->next)
+		_cr_pgarr_release(ctx, pgarr);
+}
+
+/* free every page-array in the chain along with its addrs/pages buffers */
+void cr_pgarr_free(struct cr_ctx *ctx)
+{
+	struct cr_pgarr *pgarr, *pgnxt;
+
+	for (pgarr = ctx->pgarr; pgarr; pgarr = pgnxt) {
+		_cr_pgarr_release(ctx, pgarr);
+		free_pages((unsigned long) pgarr->addrs, CR_ORDER_PGARR);
+		free_pages((unsigned long) pgarr->pages, CR_ORDER_PGARR);
+		pgnxt = pgarr->next;
+		kfree(pgarr);
+	}
+}
+
+/* allocate and add a new page-array to chain */
+struct cr_pgarr *cr_pgarr_alloc(struct cr_ctx *ctx, struct cr_pgarr **pgnew)
+{
+	struct cr_pgarr *pgarr = ctx->pgcur;
+
+	if (pgarr && pgarr->next) {
+		ctx->pgcur = pgarr->next;
+		return pgarr->next;
+	}
+
+	pgarr = kzalloc(sizeof(*pgarr), GFP_KERNEL);
+	if (pgarr) {
+		pgarr->nused = 0;
+		pgarr->nleft = CR_PGARR_TOTAL;
+		pgarr->addrs = (unsigned long *)
+			__get_free_pages(GFP_KERNEL, CR_ORDER_PGARR);
+		pgarr->pages = (struct page **)
+			__get_free_pages(GFP_KERNEL, CR_ORDER_PGARR);
+		if (likely(pgarr->addrs && pgarr->pages)) {
+			*pgnew = pgarr;
+			ctx->pgcur = pgarr;
+			return
pgarr;
+		}
+		free_pages((unsigned long) pgarr->addrs, CR_ORDER_PGARR);
+		free_pages((unsigned long) pgarr->pages, CR_ORDER_PGARR);
+		kfree(pgarr);	/* free_pages() is a no-op for a 0 address */
+	}
+
+	return NULL;
+}
+
+/* return current page-array; allocate when none exists yet or it is full */
+struct cr_pgarr *cr_pgarr_prep(struct cr_ctx *ctx)
+{
+	struct cr_pgarr *pgarr = ctx->pgcur;
+
+	if (unlikely(!pgarr || !pgarr->nleft))
+		pgarr = cr_pgarr_alloc(ctx, pgarr ? &pgarr->next : &ctx->pgarr);
+	return pgarr;
+}
+
+/*
+ * Checkpoint is outside the context of the checkpointee, so one cannot
+ * simply read pages from user-space. Instead, we scan the address space
+ * of the target to cherry-pick pages of interest. Selected pages are
+ * enlisted in a page-array chain (attached to the checkpoint context).
+ * To save their contents, each page is mapped to kernel memory and then
+ * dumped to the file descriptor.
+ */
+
+/**
+ * cr_vma_fill_pgarr - fill a page-array with addr/page tuples for a vma
+ * @ctx - checkpoint context
+ * @pgarr - page-array to fill
+ * @vma - vma to scan
+ * @start - start address (updated)
+ */
+static int cr_vma_fill_pgarr(struct cr_ctx *ctx, struct cr_pgarr *pgarr,
+			     struct vm_area_struct *vma, unsigned long *start)
+{
+	unsigned long end = vma->vm_end;
+	unsigned long addr = *start;
+	struct page **pagep;
+	unsigned long *addrp;
+	int cow, nr, ret = 0;
+
+	nr = pgarr->nleft;
+	pagep = &pgarr->pages[pgarr->nused];
+	addrp = &pgarr->addrs[pgarr->nused];
+	cow = !!vma->vm_file;
+
+	while (addr < end) {
+		struct page *page;
+
+		/* simplified version of get_user_pages(): already have vma,
+		 * only need FOLL_TOUCH, and (for now) ignore fault stats */
+
+		cond_resched();
+		while (!(page = follow_page(vma, addr, FOLL_TOUCH))) {
+			ret = handle_mm_fault(vma->vm_mm, vma, addr, 0);
+			if (ret & VM_FAULT_ERROR) {
+				if (ret & VM_FAULT_OOM)
+					ret = -ENOMEM;
+				else if (ret & VM_FAULT_SIGBUS)
+					ret = -EFAULT;
+				else
+					BUG();
+				break;
+			}
+			cond_resched();
+		}
+		/* a failed fault leaves the inner loop with page == NULL */
+		if (ret < 0 || IS_ERR(page)) {
+			ret = ret < 0 ? ret : PTR_ERR(page);
+			break;
+		}
+
+		if (page == ZERO_PAGE(0))
+			page = NULL;	/* zero page: ignore
*/ + else if (cow && page_mapping(page) != NULL) + page = NULL; /* clean cow: ignore */ + else { + get_page(page); + *(addrp++) = addr; + *(pagep++) = page; + if (--nr == 0) { + addr += PAGE_SIZE; + break; + } + } + + addr += PAGE_SIZE; + } + + if (unlikely(ret < 0)) { + nr = pgarr->nleft - nr; + while (nr--) + page_cache_release(*(--pagep)); + return ret; + } + + *start = addr; + return pgarr->nleft - nr; +} + +/** + * cr_vma_scan_pages - scan vma for pages that will need to be dumped + * @ctx - checkpoint context + * @vma - vma to scan + * + * a list of addr/page tuples is kept in ctx->pgarr page-array chain + */ +static int cr_vma_scan_pages(struct cr_ctx *ctx, struct vm_area_struct *vma) +{ + unsigned long addr = vma->vm_start; + unsigned long end = vma->vm_end; + struct cr_pgarr *pgarr; + int nr, total = 0; + + while (addr < end) { + pgarr = cr_pgarr_prep(ctx); + if (!pgarr) + return -ENOMEM; + nr = cr_vma_fill_pgarr(ctx, pgarr, vma, &addr); + if (nr < 0) + return nr; + pgarr->nleft -= nr; + pgarr->nused += nr; + total += nr; + } + + pr_debug("total %d\n", total); + return total; +} + +/** + * cr_vma_dump_pages - dump pages listed in the ctx page-array chain + * @ctx - checkpoint context + * @total - total number of pages + */ +static int cr_vma_dump_pages(struct cr_ctx *ctx, int total) +{ + struct cr_pgarr *pgarr; + int ret; + + if (!total) + return 0; + + for (pgarr = ctx->pgarr; pgarr; pgarr = pgarr->next) { + ret = cr_kwrite(ctx, pgarr->addrs, + pgarr->nused * sizeof(*pgarr->addrs)); + if (ret < 0) + return ret; + } + + for (pgarr = ctx->pgarr; pgarr; pgarr = pgarr->next) { + struct page **pages = pgarr->pages; + int nr = pgarr->nused; + void *ptr; + + while (nr--) { + ptr = kmap(*pages); + ret = cr_kwrite(ctx, ptr, PAGE_SIZE); + kunmap(*pages); + if (ret < 0) + return ret; + pages++; + } + } + + return total; +} + +static int cr_write_vma(struct cr_ctx *ctx, struct vm_area_struct *vma) +{ + struct cr_hdr h; + struct cr_hdr_vma *hh = ctx->tbuf; + char 
*fname = NULL; + int how, nr, ret; + + h.type = CR_HDR_VMA; + h.len = sizeof(*hh); + h.id = ctx->pid; + + hh->vm_start = vma->vm_start; + hh->vm_end = vma->vm_end; + hh->vm_page_prot = vma->vm_page_prot.pgprot; + hh->vm_flags = vma->vm_flags; + hh->vm_pgoff = vma->vm_pgoff; + + if (vma->vm_flags & (VM_SHARED | VM_IO | VM_HUGETLB | VM_NONLINEAR)) { + printk(KERN_WARNING "CR: unknown VMA %#lx\n", vma->vm_flags); + return -ETXTBSY; + } + + /* by default assume anon memory */ + how = CR_VMA_ANON; + + /* if there is a backing file, assume private-mapped */ + /* (NEED: check if the file is unlinked) */ + if (vma->vm_file) { + nr = PAGE_SIZE; + fname = cr_get_fname(&vma->vm_file->f_path, + ctx->vfsroot, ctx->tbuf, &nr); + if (IS_ERR(fname)) + return PTR_ERR(fname); + hh->namelen = nr; + how = CR_VMA_FILE; + } else + hh->namelen = 0; + + hh->how = how; + + /* + * it seems redundant now, but we do it in 3 steps for because: + * first, the logic is simpler when we how many pages before + * dumping them; second, a future optimization will defer the + * writeout (dump, and free) to a later step; in which case all + * the pages to be dumped will be aggregated on the checkpoint ctx + */ + + /* (1) scan: scan through the PTEs of the vma, both to count the + * pages to dump, and make those pages COW. 
keep the list of pages + * (and a reference to each page) on the checkpoint ctx */ + nr = cr_vma_scan_pages(ctx, vma); + if (nr < 0) { + cr_put_fname(ctx->tbuf, fname, PAGE_SIZE); + return nr; + } + + hh->npages = nr; + ret = cr_write_obj(ctx, &h, hh); + + if (!ret && hh->namelen) + ret = cr_write_str(ctx, fname, hh->namelen); + + cr_put_fname(ctx->tbuf, fname, PAGE_SIZE); + + if (ret < 0) + return ret; + + /* (2) dump: write out the addresses of all pages in the list (on + * the checkpoint ctx) followed by the contents of all pages */ + ret = cr_vma_dump_pages(ctx, nr); + + /* (3) free: free the extra references to the pages in the list */ + cr_pgarr_release(ctx); + + return ret; +} + +int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t) +{ + struct cr_hdr h; + struct cr_hdr_mm *hh = ctx->tbuf; + struct mm_struct *mm; + struct vm_area_struct *vma; + int ret; + + h.type = CR_HDR_MM; + h.len = sizeof(*hh); + h.id = ctx->pid; + + mm = get_task_mm(t); + + hh->tag = 1; /* non-zero will mean first time encounter */ + + hh->start_code = mm->start_code; + hh->end_code = mm->end_code; + hh->start_data = mm->start_data; + hh->end_data = mm->end_data; + hh->start_brk = mm->start_brk; + hh->brk = mm->brk; + hh->start_stack = mm->start_stack; + hh->arg_start = mm->arg_start; + hh->arg_end = mm->arg_end; + hh->env_start = mm->env_start; + hh->env_end = mm->env_end; + + hh->map_count = mm->map_count; + + /* FIX: need also mm->flags */ + + ret = cr_write_obj(ctx, &h, hh); + if (ret < 0) + goto out; + + /* write the vma's */ + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + ret = cr_write_vma(ctx, vma); + if (ret < 0) + break; + } + up_read(&mm->mmap_sem); + + if (ret < 0) + goto out; + + ret = cr_write_mm_context(ctx, mm); + + out: + mmput(mm); + return ret; +} diff -puN /dev/null checkpoint/ckpt_mem.h --- /dev/null 2008-04-22 10:49:52.000000000 -0700 +++ oren-cr.git-dave/checkpoint/ckpt_mem.h 2008-08-20 12:12:49.000000000 -0700 @@ -0,0 +1,32 
@@ +/* + * Generic container checkpoint-restart + * + * Copyright (C) 2008 Oren Laadan + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ + +#include + +/* page-array chains: each pgarr holds a list of (addr, page) tuples */ +struct cr_pgarr { + unsigned long *addrs; /* addresses of the listed pages */ + struct page **pages; /* the pages themselves, same index as addrs */ + struct cr_pgarr *next; /* next page-array in the chain */ + unsigned short nleft; /* free slots remaining */ + unsigned short nused; /* slots currently filled */ +}; + +/* vma subtypes */ +enum { + CR_VMA_ANON = 1, + CR_VMA_FILE +}; + +extern void _cr_pgarr_release(struct cr_ctx *ctx, struct cr_pgarr *pgarr); +extern void cr_pgarr_release(struct cr_ctx *ctx); +extern void cr_pgarr_free(struct cr_ctx *ctx); +extern struct cr_pgarr *cr_pgarr_alloc(struct cr_ctx *ctx, struct cr_pgarr **pgnew); +extern struct cr_pgarr *cr_pgarr_prep(struct cr_ctx *ctx); diff -puN checkpoint/Makefile~0005-checkpoint-restart-memory-management checkpoint/Makefile --- oren-cr.git/checkpoint/Makefile~0005-checkpoint-restart-memory-management 2008-08-20 12:12:49.000000000 -0700 +++ oren-cr.git-dave/checkpoint/Makefile 2008-08-20 12:12:49.000000000 -0700 @@ -1,2 +1,2 @@ -obj-y += sys.o checkpoint.o restart.o +obj-y += sys.o checkpoint.o restart.o ckpt_mem.o rstr_mem.o obj-$(CONFIG_X86) += x86.o diff -puN /dev/null checkpoint/rstr_mem.c --- /dev/null 2008-04-22 10:49:52.000000000 -0700 +++ oren-cr.git-dave/checkpoint/rstr_mem.c 2008-08-20 12:12:49.000000000 -0700 @@ -0,0 +1,354 @@ +/* + * Restart memory contents + * + * Copyright (C) 2008 Oren Laadan + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ckpt.h" +#include "ckpt_arch.h" +#include "ckpt_hdr.h" +#include "ckpt_mem.h" + +/* + * Unlike checkpoint, restart is executed in the context of each restarting + * process: vma regions are restored via a call to mmap(), and the data is + * read in directly to the address space of the current process + */ + +/** + * cr_vma_read_pages_addr - read addresses of pages to page-array chain + * @ctx - restart context + * @npages - number of pages + */ +static int cr_vma_read_pages_addr(struct cr_ctx *ctx, int npages) +{ + struct cr_pgarr *pgarr; + int nr, ret; + + while (npages) { + pgarr = cr_pgarr_prep(ctx); + if (!pgarr) + return -ENOMEM; + nr = min(npages, (int) pgarr->nleft); + ret = cr_kread(ctx, pgarr->addrs, nr * sizeof(unsigned long)); + if (ret < 0) + return ret; + pgarr->nleft -= nr; + pgarr->nused += nr; + npages -= nr; + } + return 0; +} + +/** + * cr_vma_read_pages_data - read in data of pages in page-array chain + * @ctx - restart context + * @npages - number of pages + */ +static int cr_vma_read_pages_data(struct cr_ctx *ctx, int npages) +{ + struct cr_pgarr *pgarr; + unsigned long *addrs; + int nr, ret; + + for (pgarr = ctx->pgarr; npages; pgarr = pgarr->next) { + addrs = pgarr->addrs; + nr = pgarr->nused; + npages -= nr; + while (nr--) { + ret = cr_uread(ctx, (void *) *(addrs++), PAGE_SIZE); + if (ret < 0) + return ret; + } + } + + return 0; +} + +/* change the protection of an address range to be writable/non-writable. 
+ * this is useful when restoring the memory of a read-only vma */ +static int cr_vma_writable(struct mm_struct *mm, unsigned long start, + unsigned long end, int writable) +{ + struct vm_area_struct *vma, *prev; + unsigned long flags = 0; + int ret = -EINVAL; + + pr_debug("vma %#lx-%#lx writable %d\n", start, end, writable); + + down_write(&mm->mmap_sem); + vma = find_vma_prev(mm, start, &prev); + if (unlikely(!vma || vma->vm_start > end || vma->vm_end < start)) + goto out; + if (writable && !(vma->vm_flags & VM_WRITE)) + flags = vma->vm_flags | VM_WRITE; + else if (!writable && (vma->vm_flags & VM_WRITE)) + flags = vma->vm_flags & ~VM_WRITE; + pr_debug("flags %#lx\n", flags); + if (flags) + ret = mprotect_fixup(vma, &prev, vma->vm_start, + vma->vm_end, flags); + out: + up_write(&mm->mmap_sem); + return ret; +} + +/** + * cr_vma_read_pages - read in pages for to restore a vma + * @ctx - restart context + * @cr_vma - vma descriptor from restart + */ +static int cr_vma_read_pages(struct cr_ctx *ctx, struct cr_hdr_vma *cr_vma) +{ + struct mm_struct *mm = current->mm; + int ret = 0; + + if (!cr_vma->npages) + return 0; + + /* in the unlikely case that this vma is read-only */ + if (!(cr_vma->vm_flags & VM_WRITE)) + ret = cr_vma_writable(mm, cr_vma->vm_start, cr_vma->vm_end, 1); + + if (!ret) + ret = cr_vma_read_pages_addr(ctx, cr_vma->npages); + if (!ret) + ret = cr_vma_read_pages_data(ctx, cr_vma->npages); + if (ret < 0) + return ret; + + cr_pgarr_release(ctx); /* reset page-array chain */ + + /* restore original protection for this vma */ + if (!(cr_vma->vm_flags & VM_WRITE)) + ret = cr_vma_writable(mm, cr_vma->vm_start, cr_vma->vm_end, 0); + + return ret; +} + +/** + * cr_calc_map_prot_bits - convert vm_flags to mmap protection + * orig_vm_flags: source vm_flags + */ +static unsigned long cr_calc_map_prot_bits(unsigned long orig_vm_flags) +{ + unsigned long vm_prot = 0; + + if (orig_vm_flags & VM_READ) + vm_prot |= PROT_READ; + if (orig_vm_flags & VM_WRITE) + 
vm_prot |= PROT_WRITE; + if (orig_vm_flags & VM_EXEC) + vm_prot |= PROT_EXEC; + if (orig_vm_flags & PROT_SEM) /* only (?) with IPC-SHM */ + vm_prot |= PROT_SEM; + + return vm_prot; +} + +/** + * cr_calc_map_flags_bits - convert vm_flags to mmap flags + * orig_vm_flags: source vm_flags + */ +static unsigned long cr_calc_map_flags_bits(unsigned long orig_vm_flags) +{ + unsigned long vm_flags = 0; + + vm_flags = MAP_FIXED; + if (orig_vm_flags & VM_GROWSDOWN) + vm_flags |= MAP_GROWSDOWN; + if (orig_vm_flags & VM_DENYWRITE) + vm_flags |= MAP_DENYWRITE; + if (orig_vm_flags & VM_EXECUTABLE) + vm_flags |= MAP_EXECUTABLE; + if (orig_vm_flags & VM_MAYSHARE) + vm_flags |= MAP_SHARED; + else + vm_flags |= MAP_PRIVATE; + + return vm_flags; +} + +static int cr_read_vma(struct cr_ctx *ctx, struct mm_struct *mm) +{ + struct cr_hdr_vma *hh = cr_hbuf_get(ctx, sizeof(*hh)); + unsigned long vm_size, vm_flags, vm_prot, vm_pgoff; + void *addr; + unsigned long flags; + struct file *file = NULL; + char *fname = NULL; + int ret; + + ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_VMA); + if (ret < 0) + return ret; + + pr_debug("vma %#lx-%#lx npages %d namelen %d\n", + (unsigned long) hh->vm_start, (unsigned long) hh->vm_end, + (int) hh->npages, (int) hh->namelen); + + if (hh->vm_end < hh->vm_start) + return -EINVAL; + if (hh->npages < 0 || hh->namelen < 0) + return -EINVAL; + + vm_size = hh->vm_end - hh->vm_start; + vm_prot = cr_calc_map_prot_bits(hh->vm_flags); + vm_flags = cr_calc_map_flags_bits(hh->vm_flags); + vm_pgoff = hh->vm_pgoff; + + if (hh->namelen) { + fname = ctx->tbuf; + ret = cr_read_str(ctx, fname, PAGE_SIZE); + if (ret < 0) + return ret; + } + + pr_debug("vma fname '%s' how %d\n", fname, hh->how); + + switch (hh->how) { + + case CR_VMA_ANON: /* anonymous private mapping */ + if (hh->namelen) + return -EINVAL; + /* vm_pgoff for anonymous mapping is the "global" page + offset (namely from addr 0x0), so we force a zero */ + vm_pgoff = 0; + break; + + case CR_VMA_FILE: /* 
private mapping from a file */
+		if (!hh->namelen)
+			return -EINVAL;
+		/* O_RDWR only needed if both (VM_WRITE|VM_SHARED) are set */
+		flags = hh->vm_flags & (VM_WRITE | VM_SHARED);
+		flags = (flags == (VM_WRITE | VM_SHARED) ? O_RDWR : O_RDONLY);
+		file = filp_open(fname, flags, 0);
+		if (IS_ERR(file))
+			return PTR_ERR(file);
+		break;
+
+	default:
+		return -EINVAL;
+
+	}
+	down_write(&mm->mmap_sem);	/* do_mmap_pgoff() needs it held */
+	addr = (void *)do_mmap_pgoff(file, (unsigned long) hh->vm_start,
+				     vm_size, vm_prot, vm_flags, vm_pgoff);
+	up_write(&mm->mmap_sem);
+	pr_debug("vma size %#lx prot %#lx flags %#lx pgoff %#lx => %p\n",
+		 vm_size, vm_prot, vm_flags, vm_pgoff, addr);
+	/* the file (if opened) is now referenced by the vma */
+	if (file)
+		filp_close(file, NULL);
+
+	if (IS_ERR(addr))
+		return PTR_ERR(addr);
+
+	/*
+	 * CR_VMA_ANON: read in memory as is
+	 * CR_VMA_FILE: read in memory as is
+	 * (more to follow ...)
+	 */
+
+	switch (hh->how) {
+	case CR_VMA_ANON:
+	case CR_VMA_FILE:
+		/* standard case: read the data into the memory */
+		ret = cr_vma_read_pages(ctx, hh);
+		break;
+	}
+
+	if (ret < 0)
+		return ret;
+
+	if (vm_prot & PROT_EXEC)
+		flush_icache_range(hh->vm_start, hh->vm_end);
+
+	cr_hbuf_put(ctx, sizeof(*hh));
+	pr_debug("vma retval %d\n", ret);
+	return 0;
+}
+
+static int cr_destroy_mm(struct mm_struct *mm)
+{
+	struct vm_area_struct *vmnext = mm->mmap;
+	struct vm_area_struct *vma;
+	int ret;
+
+	while (vmnext) {
+		vma = vmnext;
+		vmnext = vmnext->vm_next;
+		ret = do_munmap(mm, vma->vm_start, vma->vm_end-vma->vm_start);
+		if (ret < 0)
+			return ret;
+	}
+	return 0;
+}
+
+int cr_read_mm(struct cr_ctx *ctx)
+{
+	struct cr_hdr_mm *hh = cr_hbuf_get(ctx, sizeof(*hh));
+	struct mm_struct *mm;
+	int nr, ret;
+
+	ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_MM);
+	if (ret < 0)
+		return ret;
+
+	pr_debug("map_count %d\n", hh->map_count);
+
+	/* XXX need more sanity checks */
+	if (hh->start_code > hh->end_code ||
+	    hh->start_data > hh->end_data || hh->map_count < 0)
+		return -EINVAL;
+
+	mm = current->mm;
+
+	/* point of no
return -- destruct current mm */ + down_write(&mm->mmap_sem); + ret = cr_destroy_mm(mm); + up_write(&mm->mmap_sem); + + if (ret < 0) + return ret; + + mm->start_code = hh->start_code; + mm->end_code = hh->end_code; + mm->start_data = hh->start_data; + mm->end_data = hh->end_data; + mm->start_brk = hh->start_brk; + mm->brk = hh->brk; + mm->start_stack = hh->start_stack; + mm->arg_start = hh->arg_start; + mm->arg_end = hh->arg_end; + mm->env_start = hh->env_start; + mm->env_end = hh->env_end; + + /* FIX: need also mm->flags */ + + for (nr = hh->map_count; nr; nr--) { + ret = cr_read_vma(ctx, mm); + if (ret < 0) + return ret; + } + + cr_hbuf_put(ctx, sizeof(*hh)); + + return cr_read_mm_context(ctx, mm); +} diff -puN checkpoint/sys.c~0005-checkpoint-restart-memory-management checkpoint/sys.c --- oren-cr.git/checkpoint/sys.c~0005-checkpoint-restart-memory-management 2008-08-20 12:12:49.000000000 -0700 +++ oren-cr.git-dave/checkpoint/sys.c 2008-08-20 12:12:49.000000000 -0700 @@ -15,6 +15,7 @@ #include #include "ckpt.h" +#include "ckpt_mem.h" /* * helpers to write/read to/from the image file descriptor @@ -118,8 +119,7 @@ void cr_ctx_free(struct cr_ctx *ctx) if (ctx->vfsroot) path_put(ctx->vfsroot); - free_pages((unsigned long) ctx->tbuf, CR_ORDER_TBUF); - free_pages((unsigned long) ctx->hbuf, CR_ORDER_HBUF); + cr_pgarr_free(ctx); kfree(ctx); } @@ -132,11 +132,6 @@ struct cr_ctx *cr_ctx_alloc(pid_t pid, s if (!ctx) return NULL; - ctx->tbuf = (void *) __get_free_pages(GFP_KERNEL, CR_ORDER_TBUF); - ctx->hbuf = (void *) __get_free_pages(GFP_KERNEL, CR_ORDER_HBUF); - if (!ctx->tbuf || !ctx->hbuf) - goto nomem; - ctx->pid = pid; ctx->flags = flags; @@ -150,10 +145,6 @@ struct cr_ctx *cr_ctx_alloc(pid_t pid, s ctx->crid = atomic_inc_return(&cr_ctx_count); return ctx; - - nomem: - cr_ctx_free(ctx); - return NULL; } /** diff -puN checkpoint/x86.c~0005-checkpoint-restart-memory-management checkpoint/x86.c --- oren-cr.git/checkpoint/x86.c~0005-checkpoint-restart-memory-management 
2008-08-20 12:12:49.000000000 -0700
+++ oren-cr.git-dave/checkpoint/x86.c	2008-08-20 12:12:49.000000000 -0700
@@ -1,5 +1,6 @@
 #include
 #include
+#include
 #include
 
 #include "ckpt.h"
@@ -268,3 +269,85 @@ int cr_read_cpu(struct cr_ctx *ctx)
 
 	return 0;
 }
+
+int cr_write_mm_context(struct cr_ctx *ctx, struct mm_struct *mm)
+{
+	struct cr_hdr h;
+	struct cr_hdr_mm_context *hh = ctx->tbuf;
+	int ret;
+
+	h.type = CR_HDR_MM_CONTEXT;
+	h.len = sizeof(*hh);
+	h.id = ctx->pid;
+
+	mutex_lock(&mm->context.lock);
+
+	hh->ldt_entry_size = LDT_ENTRY_SIZE;
+	hh->nldt = mm->context.size;
+
+	pr_debug("nldt %d\n", hh->nldt);
+
+	ret = cr_write_obj(ctx, &h, hh);
+	if (ret < 0)
+		goto out;	/* must not return with context.lock held */
+
+	ret = cr_kwrite(ctx, mm->context.ldt, hh->nldt * LDT_ENTRY_SIZE);
+ out:
+	mutex_unlock(&mm->context.lock);
+
+	return ret;
+}
+
+int cr_read_mm_context(struct cr_ctx *ctx, struct mm_struct *mm)
+{
+	struct cr_hdr_mm_context *hh = cr_hbuf_get(ctx, sizeof(*hh));
+	int n, ret;
+
+	ret = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_MM_CONTEXT);
+	if (ret < 0)
+		return ret;
+
+	pr_debug("nldt %d\n", hh->nldt);
+
+	if (hh->nldt < 0 || hh->ldt_entry_size != LDT_ENTRY_SIZE)
+		return -EINVAL;
+
+	/* to utilize the syscall modify_ldt() we first convert the data
+	 * in the checkpoint image from 'struct desc_struct' to 'struct
+	 * user_desc' with reverse logic of include/asm/desc.h:fill_ldt() */
+
+	for (n = 0; n < hh->nldt; n++) {
+		struct user_desc info;
+		struct desc_struct desc;
+		mm_segment_t old_fs;
+
+		ret = cr_kread(ctx, &desc, LDT_ENTRY_SIZE);
+		if (ret < 0)
+			return ret;
+
+		info.entry_number = n;
+		info.base_addr = desc.base0 | (desc.base1 << 16) |
+				 (desc.base2 << 24);	/* base bits 24-31 */
+		info.limit = desc.limit0 | (desc.limit << 16);	/* bits 16-19 */
+		info.seg_32bit = desc.d;
+		info.contents = desc.type >> 2;
+		info.read_exec_only = (desc.type >> 1) ^ 1;
+		info.limit_in_pages = desc.g;
+		info.seg_not_present = desc.p ^ 1;
+		info.useable = desc.avl;
+		old_fs = get_fs();
+		set_fs(get_ds());
+		/* ret = sys_modify_ldt(1, &info, sizeof(info)); */
+		/* modified by daveh */
+
ret = write_ldt(&info, sizeof(info), 1); + set_fs(old_fs); + + if (ret < 0) + return ret; + } + + load_LDT(&mm->context); + + cr_hbuf_put(ctx, sizeof(*hh)); + return 0; +} diff -puN include/asm-x86/ckpt.h~0005-checkpoint-restart-memory-management include/asm-x86/ckpt.h --- oren-cr.git/include/asm-x86/ckpt.h~0005-checkpoint-restart-memory-management 2008-08-20 12:12:49.000000000 -0700 +++ oren-cr.git-dave/include/asm-x86/ckpt.h 2008-08-20 12:12:49.000000000 -0700 @@ -43,4 +43,9 @@ struct cr_hdr_cpu { union thread_xstate xstate; /* i387 */ }; +struct cr_hdr_mm_context { + __s16 ldt_entry_size; + __s16 nldt; +}; + #endif /* __ASM_X86_CKPT_H */ diff -puN include/asm-x86/desc.h~0005-checkpoint-restart-memory-management include/asm-x86/desc.h --- oren-cr.git/include/asm-x86/desc.h~0005-checkpoint-restart-memory-management 2008-08-20 12:12:49.000000000 -0700 +++ oren-cr.git-dave/include/asm-x86/desc.h 2008-08-20 12:12:49.000000000 -0700 @@ -111,6 +111,8 @@ static inline void native_write_ldt_entr memcpy(&ldt[entry], desc, 8); } +int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode); + static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int type) { @@ -394,7 +396,6 @@ static inline void set_system_gate_ist(i shll $16, base; \ movw idx * 8 + 2(gdt), lo_w; - #endif /* __ASSEMBLY__ */ #endif _ -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/