2005-04-18 03:25:59

by Takashi Ikebe

[permalink] [raw]
Subject: [PATCH i386] Live Patching Function on 2.6.11.7

diff -urpN linux-2.6.11.7-vanilla/arch/i386/kernel/Makefile linux-2.6.11.7-pannus-i386/arch/i386/kernel/Makefile
--- linux-2.6.11.7-vanilla/arch/i386/kernel/Makefile 2005-04-08 03:57:22.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/arch/i386/kernel/Makefile 2005-04-18 12:32:13.000000000 +0900
@@ -7,7 +7,7 @@ extra-y := head.o init_task.o vmlinux.ld
obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \
ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \
- doublefault.o quirks.o
+ doublefault.o quirks.o accesspvm.o exechandle.o

obj-y += cpu/
obj-y += timers/
diff -urpN linux-2.6.11.7-vanilla/arch/i386/kernel/accesspvm.c linux-2.6.11.7-pannus-i386/arch/i386/kernel/accesspvm.c
--- linux-2.6.11.7-vanilla/arch/i386/kernel/accesspvm.c 1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/arch/i386/kernel/accesspvm.c 2005-04-18 12:32:13.000000000 +0900
@@ -0,0 +1,128 @@
+/*
+ * accesspvm.c
+ * Copyright (C) 2004 NTT Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * Provide the system call to read/write the specific data in the user process.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/errno.h>
+
+#include <asm/uaccess.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+/*
+ * sys_accesspvm - read or write a range of another process's memory.
+ * param pid   : ID of the target process (must be greater than 1)
+ * param addr  : start address inside the target's address space
+ * param datap : address of the caller's user-space buffer
+ * param len   : number of bytes to transfer (must be positive)
+ * param flag  : direction (0: target -> caller, 1: caller -> target)
+ * return      : 0 on success, otherwise a negative error code:
+ *               -ESRCH  no task with this pid,
+ *               -EPERM  pid <= 1, credential mismatch without
+ *                       CAP_SYS_PANNUS, or unknown flag,
+ *               -EINVAL len is not positive,
+ *               -ENOMEM the bounce buffer could not be allocated,
+ *               -EIO    a user copy or access_process_vm() came up short.
+ */
+asmlinkage int sys_accesspvm(long pid, unsigned long addr, long datap, int len, int flag)
+{
+	struct task_struct *tsk;
+	void *buf;
+	int ret = -EPERM;
+
+	/* Look the task up and take a reference while the list is locked. */
+	read_lock(&tasklist_lock);
+	tsk = find_task_by_pid(pid);
+	if (tsk)
+		get_task_struct(tsk);
+	read_unlock(&tasklist_lock);
+	if (!tsk)
+		return -ESRCH;
+
+	/* init (pid 1) and non-positive pids are never valid targets. */
+	if (pid <= 1)
+		goto out_tsk;
+
+	/*
+	 * The caller must match every uid/gid of the target or hold
+	 * CAP_SYS_PANNUS.  Leave through out_tsk so that the reference
+	 * taken above is dropped on this path as well.
+	 */
+	if (((current->uid != tsk->euid) ||
+	     (current->uid != tsk->suid) ||
+	     (current->uid != tsk->uid) ||
+	     (current->gid != tsk->egid) ||
+	     (current->gid != tsk->sgid) ||
+	     (current->gid != tsk->gid)) && !capable(CAP_SYS_PANNUS))
+		goto out_tsk;
+
+	/* Reject bad arguments before allocating anything. */
+	if (len <= 0) {
+		ret = -EINVAL;
+		goto out_tsk;
+	}
+	if (flag != 0 && flag != 1)
+		goto out_tsk;
+
+	/* Bounce buffer between the two address spaces. */
+	buf = vmalloc(len);
+	if (!buf) {
+		printk("accesspvm: Cannot allocate by vmalloc\n");
+		ret = -ENOMEM;
+		goto out_tsk;
+	}
+
+	if (flag == 0) {
+		/* target -> bounce buffer -> caller */
+		if (access_process_vm(tsk, addr, buf, len, 0) != len) {
+			ret = -EIO;
+		} else if (copy_to_user((void __user *)datap, buf, len)) {
+			printk("accesspvm: Copy_to_user error\n");
+			ret = -EIO;
+		} else {
+			ret = 0;
+		}
+	} else {
+		/* caller -> bounce buffer -> target */
+		if (copy_from_user(buf, (const void __user *)datap, len)) {
+			printk("accesspvm: Copy_from_user error\n");
+			ret = -EIO;
+		} else if (access_process_vm(tsk, addr, buf, len, 1) != len) {
+			ret = -EIO;
+		} else {
+			ret = 0;
+		}
+	}
+
+	vfree(buf);
+out_tsk:
+	put_task_struct(tsk);	/* release the task_struct */
+	return ret;
+}
diff -urpN linux-2.6.11.7-vanilla/arch/i386/kernel/asm-offsets.c linux-2.6.11.7-pannus-i386/arch/i386/kernel/asm-offsets.c
--- linux-2.6.11.7-vanilla/arch/i386/kernel/asm-offsets.c 2005-04-08 03:57:30.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/arch/i386/kernel/asm-offsets.c 2005-04-18 12:32:13.000000000 +0900
@@ -52,6 +52,7 @@ void foo(void)
OFFSET(TI_preempt_count, thread_info, preempt_count);
OFFSET(TI_addr_limit, thread_info, addr_limit);
OFFSET(TI_restart_block, thread_info, restart_block);
+ OFFSET(TI_inipending, thread_info, inipending);
BLANK();

OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
diff -urpN linux-2.6.11.7-vanilla/arch/i386/kernel/entry.S linux-2.6.11.7-pannus-i386/arch/i386/kernel/entry.S
--- linux-2.6.11.7-vanilla/arch/i386/kernel/entry.S 2005-04-08 03:57:26.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/arch/i386/kernel/entry.S 2005-04-18 12:32:13.000000000 +0900
@@ -172,8 +172,15 @@ ENTRY(resume_userspace)
andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
# int/exception return?
jne work_pending
+ cmpl $0,TI_inipending(%ebp) #for live patching fook.
+ jne resume_init
jmp restore_all

+ /*
+  * Live-patch hook on the resume_userspace path: entered when
+  * TI_inipending(%ebp) is non-zero and the _TIF_WORK_MASK test found
+  * nothing to do.
+  * NOTE(review): unlike work_init, nothing is loaded into %eax/%edx
+  * before the call - confirm how do_init receives its arguments here.
+  */
+resume_init:
+ movl $0,TI_inipending(%ebp) # clear the request flag before handling it
+ call do_init
+ jmp resume_userspace # go through the resume checks again
+
#ifdef CONFIG_PREEMPT
ENTRY(resume_kernel)
cli
@@ -263,6 +270,9 @@ restore_all:
# perform work that needs to be done immediately before resumption
ALIGN
work_pending:
+ cmpl $0,TI_inipending(%ebp)
+ jne work_init
+work_pending2:
testb $_TIF_NEED_RESCHED, %cl
jz work_notifysig
work_resched:
@@ -297,6 +307,29 @@ work_notifysig_v86:
call do_notify_resume
jmp restore_all

+ # perform live patching
+ /*
+  * Entered from work_pending when TI_inipending(%ebp) is non-zero.
+  * Both variants clear the flag, call the C handler with %eax = %esp
+  * and %edx = 0, then continue at work_pending2.
+  * NOTE(review): work_pending2 tests %cl, but %ecx is not preserved
+  * across the calls to do_init / do_init_v86 - confirm this is safe.
+  */
+ ALIGN
+work_init:
+ testl $VM_MASK, EFLAGS(%esp) # flags consumed by the jne below
+ movl %esp, %eax
+ jne work_init_v86
+
+ movl $0,TI_inipending(%ebp) # clear the request flag
+ xorl %edx, %edx
+ call do_init
+ jmp work_pending2
+
+ /* Variant taken when VM_MASK is set in the saved EFLAGS. */
+ ALIGN
+work_init_v86:
+ movl $0,TI_inipending(%ebp)
+ pushl %ecx # save ti_flags across save_v86_state
+ call save_v86_state # %eax contains pt_regs pointer
+ popl %ecx
+ movl %eax, %esp # continue on the value returned in %eax
+ xorl %edx, %edx
+ call do_init_v86
+ jmp work_pending2
+
# perform syscall exit tracing
ALIGN
syscall_trace_entry:
@@ -862,5 +895,11 @@ ENTRY(sys_call_table)
.long sys_add_key
.long sys_request_key
.long sys_keyctl
+ .long sys_mmap3
+ .long sys_accesspvm /* 290 */
+ .long sys_init_pend
+ .long sys_rt_handlereturn
+ .long sys_check_init
+ .long sys_munmap3

syscall_table_size=(.-sys_call_table)
diff -urpN linux-2.6.11.7-vanilla/arch/i386/kernel/exechandle.c linux-2.6.11.7-pannus-i386/arch/i386/kernel/exechandle.c
--- linux-2.6.11.7-vanilla/arch/i386/kernel/exechandle.c 1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/arch/i386/kernel/exechandle.c 2005-04-18 12:32:13.000000000 +0900
@@ -0,0 +1,611 @@
+/*
+ * exechandle.c
+ * Copyright (C) 2004-2005 NTT Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * Initialization module.
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/wait.h>
+#include <linux/ptrace.h>
+#include <linux/unistd.h>
+#include <linux/stddef.h>
+#include <linux/tty.h>
+#include <linux/personality.h>
+#include <linux/compiler.h>
+#include <linux/binfmts.h>
+#include <asm/ucontext.h>
+#include <asm/uaccess.h>
+#include <asm/i387.h>
+#include <asm/exechandle.h>
+
+//#define DEBUG_INI 1
+#define round_down(x,y) ((x) & ~((y)-1))
+
+void init_fault(struct pt_regs *regs, void *frame, struct task_struct *me, char *where);
+
+/*
+ * Initialization frame.
+ * setup_init_frame() builds one of these on the user stack before the
+ * init handler is entered; sys_rt_handlereturn() reads it back to restore
+ * the interrupted context.
+ */
+struct rt_initframe
+{
+ char *pretcode; /* Return address for the handler; set to ka->ia.restorer */
+ struct ucontext uc; /* User-mode context saved before the handler runs */
+ struct siginfo info; /* Only its address is handed to the handler (in edx); never filled in this file */
+};
+
+/*
+ * Restore the register context that was saved before the init handler ran.
+ * param:regs register struct to fill in
+ * param:sc user-space context saved before _init
+ * param:peax receives the saved eax value
+ * return:0 if every __get_user succeeded, non-zero otherwise
+ *
+ * The helper macros below are defined inside the function and are not
+ * #undef'd afterwards.
+ */
+
+static int
+restore_initcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax)
+{
+ unsigned int err = 0;
+
+
+/* Copy one general register from the saved context into regs. */
+#define COPY(x) err |= __get_user(regs->x, &sc->x)
+
+/* Copy a segment selector into regs->x<seg> unchanged. */
+#define COPY_SEG(seg) \
+ { unsigned short tmp; \
+ err |= __get_user(tmp, &sc->seg); \
+ regs->x##seg = tmp; }
+
+/* As COPY_SEG, but force the two low bits of the selector to 3. */
+#define COPY_SEG_STRICT(seg) \
+ { unsigned short tmp; \
+ err |= __get_user(tmp, &sc->seg); \
+ regs->x##seg = tmp|3; }
+
+/* Load the saved selector straight into the segment register. */
+#define GET_SEG(seg) \
+ { unsigned short tmp; \
+ err |= __get_user(tmp, &sc->seg); \
+ loadsegment(seg,tmp); }
+
+/* Only these eflags bits are taken from the saved context. */
+#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | X86_EFLAGS_DF | \
+ X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
+ X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
+
+ GET_SEG(gs);
+ GET_SEG(fs);
+ COPY_SEG(es);
+ COPY_SEG(ds);
+ COPY(edi);
+ COPY(esi);
+ COPY(ebp);
+ COPY(esp);
+ COPY(ebx);
+ COPY(edx);
+ COPY(ecx);
+ COPY(eip);
+ COPY_SEG_STRICT(cs);
+ COPY_SEG_STRICT(ss);
+
+ {
+ unsigned int tmpflags;
+ err |= __get_user(tmpflags, &sc->eflags);
+ /* Keep every bit outside FIX_EFLAGS as it currently is in regs. */
+ regs->eflags = (regs->eflags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
+ regs->orig_eax = -1; /* disable syscall checks */
+ }
+
+ {
+ /* The fpstate pointer is read, but nothing is done with it here. */
+ struct _fpstate __user * buf;
+ err |= __get_user(buf, &sc->fpstate);
+
+ }
+
+ /* eax is handed back to the caller instead of being stored in regs. */
+ err |= __get_user(*peax, &sc->eax);
+ return err;
+
+}
+
+
+/*
+ * System call used to leave the init handler: restore the context saved
+ * in the rt_initframe on the user stack.
+ * param:__unused only its address is used, to locate the saved pt_regs
+ * return:the saved eax on success; 0 after a bad frame (inifinish is set
+ *        to -1 and the failure is logged through init_fault)
+ */
+asmlinkage long sys_rt_handlereturn(unsigned long __unused)
+{
+ /* The register struct is taken to start at the first argument slot. */
+ struct pt_regs *regs = (struct pt_regs *) &__unused;
+ /* The frame is expected 4 bytes below the current user esp. */
+ struct rt_initframe *frame = (struct rt_initframe *)(regs->esp - 4);
+ stack_t st;
+ int eax;
+ struct task_struct *me = current;
+
+#if DEBUG_INI
+ printk("INIT_CP:sys_rt_handlereturn:01\n");
+
+ printk("frame address = %p\n",frame);
+ printk("esp: %lx\n",regs->esp);
+ printk("eip: %lx\n",regs->eip);
+ printk("edx: %lx\n",regs->edx);
+ printk("esi: %lx\n",regs->esi);
+#endif
+ /* Check frame pointer */
+ if (verify_area(VERIFY_READ, frame, sizeof(*frame))) {
+ goto badframe;
+ }
+
+#if DEBUG_INI
+ printk("INIT_CP:sys_rt_handlereturn:02\n");
+#endif
+ /* Restore hardware context */
+ if (restore_initcontext(regs, &frame->uc.uc_mcontext, &eax)) {
+#if DEBUG_INI
+ printk("INIT_CP:sys_rt_handlereturn/restore_initcontext:01\n");
+#endif
+ goto badframe;
+ }
+
+#if DEBUG_INI
+ printk("%d sigreturn rip:%lx rsp:%lx frame:%p eax:%d\n",current->pid,regs->eip,regs->esp,frame,eax);
+#endif
+ /* Read the saved stack description; st is not used after this copy. */
+ if (__copy_from_user(&st, &frame->uc.uc_stack, sizeof(st))) {
+#if DEBUG_INI
+ printk("INIT_CP:sys_rt_handlereturn/copy_from_user:01\n");
+#endif
+ goto badframe;
+ }
+
+ /* Mark the initialization as finished (sys_check_init returns 0 for this value) */
+ me->thread_info->inifinish=0;
+#if DEBUG_INI
+ printk("INIT_CP:sys_rt_handlereturn:03\n");
+ printk("me->thread_info->inifinish = 0\n");
+#endif
+ return eax;
+
+ badframe:
+ /* Mark the initialization as failed (sys_check_init returns -EINVAL for this value) */
+ me->thread_info->inifinish=-1;
+#if DEBUG_INI
+ printk("INIT_CP:sys_rt_handlereturn/badframe\n");
+ printk("me->thread_info->inifinish = -1\n");
+#endif
+ init_fault(regs,frame,me,"handlereturn");
+ return 0;
+}
+
+/*
+ * Save the current register context into a user-space sigcontext.
+ * param:sc destination context (state before initialization)
+ * param:regs register struct to save
+ * param:mask value stored in the oldmask field
+ * param:me task whose trap number and error code are saved
+ * return:0 if every __put_user succeeded, non-zero otherwise
+ */
+
+static inline int
+setup_initcontext(struct sigcontext *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me)
+{
+	int failed = 0;
+	int seg = 0;
+
+	/* gs and fs are read from the live segment registers. */
+	__asm__("movl %%gs,%0" : "=r"(seg): "0"(seg));
+	failed |= __put_user(seg, (unsigned int __user *)&sc->gs);
+	__asm__("movl %%fs,%0" : "=r"(seg): "0"(seg));
+	failed |= __put_user(seg, (unsigned int __user *)&sc->fs);
+
+	/* The remaining segments and registers come from the saved regs. */
+	failed |= __put_user(regs->xes, (unsigned int __user *)&sc->es);
+	failed |= __put_user(regs->xds, (unsigned int __user *)&sc->ds);
+
+	failed |= __put_user(regs->edi, &sc->edi);
+	failed |= __put_user(regs->esi, &sc->esi);
+	failed |= __put_user(regs->ebp, &sc->ebp);
+	failed |= __put_user(regs->esp, &sc->esp);
+	failed |= __put_user(regs->ebx, &sc->ebx);
+	failed |= __put_user(regs->edx, &sc->edx);
+	failed |= __put_user(regs->ecx, &sc->ecx);
+	failed |= __put_user(regs->eax, &sc->eax);
+
+	failed |= __put_user(me->thread.trap_no, &sc->trapno);
+	failed |= __put_user(me->thread.error_code, &sc->err);
+
+	failed |= __put_user(regs->eip, &sc->eip);
+	failed |= __put_user(regs->xcs, (unsigned int __user *)&sc->cs);
+	failed |= __put_user(regs->eflags, &sc->eflags);
+	failed |= __put_user(regs->esp, &sc->esp_at_signal);
+	failed |= __put_user(regs->xss, (unsigned int __user *)&sc->ss);
+
+	/* non-iBCS2 extensions.. */
+	failed |= __put_user(mask, &sc->oldmask);
+	failed |= __put_user(current->thread.cr2, &sc->cr2);
+
+	return failed;
+}
+
+
+/*
+ * Compute where an initialization frame goes on the user stack.
+ * param:regs register struct (supplies the current user esp)
+ * param:frame_size number of bytes to reserve below esp
+ * return:esp minus frame_size, rounded down to a multiple of 8
+ */
+static inline void __user *
+get_initframe(struct pt_regs * regs, size_t frame_size)
+{
+	/* Always the normal stack. */
+	unsigned long top = regs->esp - frame_size;
+
+	return (void __user *)(top & ~7ul);
+}
+
+
+/*
+ * Build the initialization frame on the user stack and redirect the task
+ * to the init handler.
+ * param:ka information for initialization (handler and restorer addresses)
+ * param:regs register struct; saved into the frame, then rewritten
+ * param:set signal set; only set->sig[0] is saved (as oldmask)
+ * param:me current task struct
+ * return:none; on failure inipending is cleared, inifinish is set to -1
+ *        and the failure is logged through init_fault
+ */
+static void setup_init_frame(struct k_initaction *ka, struct pt_regs * regs,
+ sigset_t *set, struct task_struct *me)
+{
+ struct rt_initframe __user *frame;
+ int err = 0;
+ /* Place the frame 8 bytes below the 8-byte aligned address. */
+ frame = get_initframe(regs, sizeof(struct rt_initframe)) - 8;
+
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ goto give_sigsegv;
+
+ /* Create the ucontext. */
+ err |= __put_user(0, &frame->uc.uc_flags);
+ err |= __put_user(0, &frame->uc.uc_link);
+ err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
+ err |= __put_user(sas_ss_flags(regs->esp),
+ &frame->uc.uc_stack.ss_flags);
+ err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
+ err |= setup_initcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
+ /* uc_sigmask is left unwritten. */
+ if (err)
+ goto give_sigsegv;
+
+ /* Set up to return from userspace. */
+ err |= __put_user(ka->ia.restorer, &frame->pretcode);
+
+
+ if (err)
+ goto give_sigsegv;
+
+ /* Set up registers for the init handler */
+ regs->esp = (unsigned long) frame;
+ regs->eip = (unsigned long) ka->ia.inithandler;
+ regs->eax = (unsigned long) 0;
+ regs->edx = (unsigned long) &frame->info; /* frame->info itself is not filled in */
+ regs->ecx = (unsigned long) &frame->uc;
+
+ set_fs(USER_DS);
+ regs->xds = __USER_DS;
+ regs->xes = __USER_DS;
+ regs->xss = __USER_DS;
+ regs->xcs = __USER_CS;
+
+ /* Clear TF when entering the init handler. */
+ regs->eflags &= ~TF_MASK;
+
+#if DEBUG_INI
+ printk("INI deliver (%s:%d): sp=%p pc=%lx ra=%p\n",
+ current->comm, current->pid, frame, regs->eip, frame->pretcode);
+#endif
+
+ return;
+
+give_sigsegv:
+ /* No signal is sent here; the failure is only recorded and logged. */
+ me->thread_info->inipending=0;
+ me->thread_info->inifinish=-1;
+ init_fault(regs,frame,me,"handle deliver");
+ return;
+}
+
+
+/*
+ * Adjust an interrupted system call's result, then build the init frame.
+ * param:regs register struct
+ * param:ka information for initialization
+ * param:oldset signal set passed on to setup_init_frame
+ * return:none
+ */
+void
+handle_init(struct pt_regs *regs, struct k_initaction *ka, sigset_t *oldset)
+{
+
+#if DEBUG_INI
+	printk("INIT_CP:handle_init:01\n");
+#endif
+	/* A non-negative orig_eax is treated as "inside a system call". */
+	if (regs->orig_eax >= 0) {
+		const long rc = regs->eax;
+
+		if (rc == -ERESTART_RESTARTBLOCK || rc == -ERESTARTNOHAND) {
+			/* These restart codes are reported as EINTR. */
+			regs->eax = -EINTR;
+#if DEBUG_INI
+			printk("ERESTARTNOHAN\n");
+#endif
+		} else if (rc == -ERESTARTSYS) {
+			/* So is ERESTARTSYS. */
+			regs->eax = -EINTR;
+#if DEBUG_INI
+			printk("ERESTARTSYS\n");
+#endif
+		} else if (rc == -ERESTARTNOINTR) {
+			/* Restore eax from orig_eax and move eip back by 2 bytes. */
+			regs->eax = regs->orig_eax;
+			regs->eip -= 2;
+#if DEBUG_INI
+			printk("ERESTARTNOINTR\n");
+#endif
+		} else {
+			/* Any other value in eax is left untouched. */
+#if DEBUG_INI
+			printk("regs->eax=%ld\n",regs->eax);
+#endif
+		}
+	}
+
+	setup_init_frame(ka, regs, oldset, current);
+
+}
+/*
+ * Entry point used by entry.S (work_init_v86) after save_v86_state.
+ * param:regs register struct
+ * param:oldset signal set, may be NULL
+ * return:none
+ *
+ * The call to do_init() used to sit inside the DEBUG_INI block, which made
+ * this function a no-op in non-debug builds: entry.S had already cleared
+ * inipending, so the request was silently dropped.
+ */
+void do_init_v86(struct pt_regs *regs, sigset_t *oldset)
+{
+#if DEBUG_INI
+	printk("do_init_v86\n");
+#endif
+	do_init(regs, oldset);
+}
+
+/*
+ * Entry point from entry.S: validate the saved context, then hand over to
+ * handle_init with the current task's k_ia.
+ * param:regs register struct
+ * param:oldset signal set; NULL selects current->blocked
+ * return:none
+ */
+void do_init(struct pt_regs *regs, sigset_t *oldset)
+{
+	struct k_initaction *action = &current->k_ia;
+
+#if DEBUG_INI
+	printk("INIT_CP:do_init:01\n");
+#endif
+
+	if ((regs->xcs & 3) == 3) {
+		/* Fall back to the task's blocked set when none was given. */
+		if (oldset == NULL) {
+#if DEBUG_INI
+			printk("!oldset\n");
+#endif
+			oldset = &current->blocked;
+		}
+
+		/* Write the task's debugreg[7] into %db7 when it is non-zero. */
+		if (current->thread.debugreg[7] != 0) {
+#if DEBUG_INI
+			printk("you have current->thread.debugreg[7]\n");
+#endif
+			asm volatile("movl %0,%%db7" :: "r" (current->thread.debugreg[7]));
+		}
+
+		handle_init(regs, action, oldset);
+	} else {
+		/*
+		 * The two low bits of the saved CS are not 3: do nothing and
+		 * record state 2, which sys_check_init reports as "retry".
+		 */
+#if DEBUG_INI
+		printk("regs->xcs != 3\n");
+		printk("current->thread_info->inifinish = 2\n");
+#endif
+		current->thread_info->inifinish=2;
+	}
+}
+
+/*
+ * Log a bad-frame failure.
+ * param:regs register struct (eip, esp and orig_eax are printed)
+ * param:frame stack frame address
+ * param:me task whose name and pid are printed
+ * param:where short label naming the failing stage
+ * return:none
+ */
+void init_fault(struct pt_regs *regs, void *frame, struct task_struct *me, char *where)
+{
+#if DEBUG_INI
+	printk("INIT_CP:init_fault:01\n");
+#endif
+	/* One line per failure; nothing else is done here. */
+	printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n",
+	       me->comm,
+	       me->pid,
+	       where,
+	       frame,
+	       regs->eip,
+	       regs->esp,
+	       regs->orig_eax);
+}
+
+/*
+ * sys_init_pend - request that a task run its live-patch init handler.
+ * param:pid target process ID (must be greater than 1)
+ * param:user_k_ia user-space pointer to the initialization information
+ * return:0 on success, otherwise a negative error code:
+ *        -EFAULT user_k_ia could not be read,
+ *        -EPERM  pid <= 1, or credential mismatch without CAP_SYS_PANNUS,
+ *        -ESRCH  no task with this pid.
+ */
+asmlinkage int sys_init_pend(pid_t pid, struct k_initaction *user_k_ia)
+{
+	struct k_initaction ka;
+	struct task_struct *tsk;
+	int error;
+
+#if DEBUG_INI
+	printk("sys_init_pend\n");
+#endif
+	/* Copy initialization information from user area to kernel area. */
+	if (copy_from_user(&ka, user_k_ia, sizeof(ka)))
+		return -EFAULT;
+
+	/* if pid <= 1, parameter error */
+	if (pid <= 1)
+		return -EPERM;
+
+	/* Look the task up and take a reference while the list is locked. */
+	read_lock(&tasklist_lock);
+	tsk = find_task_by_pid(pid);
+	if (tsk)
+		get_task_struct(tsk);
+	read_unlock(&tasklist_lock);
+	if (!tsk)
+		return -ESRCH;
+
+	/*
+	 * The caller must match every uid/gid of the target or hold
+	 * CAP_SYS_PANNUS.  Leave through out_tsk so the reference is dropped.
+	 */
+	error = -EPERM;
+	if (((current->uid != tsk->euid) ||
+	     (current->uid != tsk->suid) ||
+	     (current->uid != tsk->uid) ||
+	     (current->gid != tsk->egid) ||
+	     (current->gid != tsk->sgid) ||
+	     (current->gid != tsk->gid)) && !capable(CAP_SYS_PANNUS))
+		goto out_tsk;
+
+#if DEBUG_INI
+	printk("tsk->thread_info->inipending = 1\n");
+	printk("tsk->thread_info->inifinish = 1\n");
+#endif
+	/*
+	 * Publish the handler information first and raise inipending last:
+	 * the target's return-to-user path acts on inipending as soon as it
+	 * sees it and then reads k_ia, so k_ia must be visible by then.
+	 */
+	tsk->k_ia = ka;
+	tsk->thread_info->inifinish = 1;
+	smp_mb();
+	tsk->thread_info->inipending = 1;
+
+#if DEBUG_INI
+	switch (tsk->state) {
+	case TASK_INTERRUPTIBLE:
+		printk("INIT_CP:task-state: TASK_INTERRUPTIBLE\n");
+		break;
+	case TASK_STOPPED:
+		printk("INIT_CP:task-state: TASK_STOPPED\n");
+		break;
+	case TASK_RUNNING:
+		printk("INIT_CP:task-state: TASK_RUNNING\n");
+		break;
+	case TASK_UNINTERRUPTIBLE:
+		printk("INIT_CP:task-state: TASK_UNINTERRUPTIBLE\n");
+		break;
+	default:
+		printk("INIT_CP:task-state: Others\n");
+	}
+#endif
+
+	error = 0;
+out_tsk:
+	/* Drop the reference taken at lookup, on success and failure alike. */
+	put_task_struct(tsk);
+	return error;
+}
+
+/*
+ * sys_check_init - report the state of a requested initialization.
+ * param:pid target process ID (must be greater than 1)
+ * return:0        the handler finished (inifinish == 0),
+ *        1        the request was skipped (inifinish == 2); the target
+ *                 has been re-armed and the caller should retry,
+ *        -EINVAL  the handler failed (inifinish == -1),
+ *        -EAGAIN  still pending,
+ *        -EPERM   pid <= 1, or credential mismatch without CAP_SYS_PANNUS,
+ *        -ESRCH   no task with this pid.
+ */
+asmlinkage int sys_check_init(pid_t pid)
+{
+	struct task_struct *tsk;
+	__u32 state;
+	int error;
+
+#if DEBUG_INI
+	printk("sys_check_init,pid=%d\n",pid);
+#endif
+	/* if pid <= 1, parameter error */
+	if (pid <= 1) {
+		printk("bad parameter,pid=%d\n",pid);
+		return -EPERM;
+	}
+
+	/* Look the task up and take a reference while the list is locked. */
+	read_lock(&tasklist_lock);
+	tsk = find_task_by_pid(pid);
+	if (tsk)
+		get_task_struct(tsk);
+	read_unlock(&tasklist_lock);
+#if DEBUG_INI
+	printk("task=%p\n",tsk);
+#endif
+	if (!tsk) {
+#if DEBUG_INI
+		printk("sys_check_init,can not find task_struct by pid\n");
+#endif
+		return -ESRCH;
+	}
+
+	/* From here on every exit goes through out_tsk to drop the reference. */
+	if (((current->uid != tsk->euid) ||
+	     (current->uid != tsk->suid) ||
+	     (current->uid != tsk->uid) ||
+	     (current->gid != tsk->egid) ||
+	     (current->gid != tsk->sgid) ||
+	     (current->gid != tsk->gid)) && !capable(CAP_SYS_PANNUS)) {
+#if DEBUG_INI
+		printk("sys_check_init,Invalid user\n");
+#endif
+		error = -EPERM;
+		goto out_tsk;
+	}
+
+	state = tsk->thread_info->inifinish;
+	if (state == 0) {
+		error = 0;
+	} else if (state == (__u32)-1) {
+		error = -EINVAL;
+		printk("inifnich = -1, invalid value\n");
+	} else if (state == 2) {
+		/*
+		 * do_init() skipped the request.  Re-arm the target task; the
+		 * original code set these flags on current, i.e. on the
+		 * caller of this system call, not on the task being checked.
+		 */
+		tsk->thread_info->inifinish = 1;
+		tsk->thread_info->inipending = 1;
+		error = 1;	/* means retry attach/detach */
+	} else {
+		error = -EAGAIN;
+		printk("try again! error=%d, -EAGAIN=%d\n",error,-EAGAIN);
+	}
+
+out_tsk:
+	put_task_struct(tsk);
+	return error;
+}
diff -urpN linux-2.6.11.7-vanilla/arch/i386/kernel/sys_i386.c linux-2.6.11.7-pannus-i386/arch/i386/kernel/sys_i386.c
--- linux-2.6.11.7-vanilla/arch/i386/kernel/sys_i386.c 2005-04-08 03:58:31.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/arch/i386/kernel/sys_i386.c 2005-04-18 12:32:13.000000000 +0900
@@ -19,6 +19,7 @@
#include <linux/mman.h>
#include <linux/file.h>
#include <linux/utsname.h>
+#include <linux/sched.h>

#include <asm/uaccess.h>
#include <asm/ipc.h>
@@ -44,10 +45,11 @@ asmlinkage int sys_pipe(unsigned long __
static inline long do_mmap2(
unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
- unsigned long fd, unsigned long pgoff)
+ unsigned long fd, unsigned long pgoff, int pid)
{
int error = -EBADF;
struct file * file = NULL;
+ struct task_struct *tsk;

flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
if (!(flags & MAP_ANONYMOUS)) {
@@ -55,10 +57,34 @@ static inline long do_mmap2(
if (!file)
goto out;
}
+ if(pid > 0){
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_pid((pid_t)pid);
+ if (tsk)
+ get_task_struct(tsk);
+ read_unlock(&tasklist_lock);
+ if (!tsk)
+ goto out;
+ if(((current->uid != tsk->euid) ||
+ (current->uid != tsk->suid) ||
+ (current->uid != tsk->uid) ||
+ (current->gid != tsk->egid) ||
+ (current->gid != tsk->sgid) ||
+ (current->gid != tsk->gid)) && !capable(CAP_SYS_PANNUS)) {
+ // invalid user in sys_accesspvm
+ return -EPERM;
+ }
+
+ down_write(&tsk->mm->mmap_sem);
+ error = do_mmap_pgoff2(file, addr, len, prot, flags, pgoff, tsk);
+ up_write(&tsk->mm->mmap_sem);
+ } else {
+

down_write(&current->mm->mmap_sem);
error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
up_write(&current->mm->mmap_sem);
+ }

if (file)
fput(file);
@@ -70,7 +96,44 @@ asmlinkage long sys_mmap2(unsigned long
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long pgoff)
{
- return do_mmap2(addr, len, prot, flags, fd, pgoff);
+ return do_mmap2(addr, len, prot, flags, fd, pgoff,0);
+}
+
+/*
+ * Provide the mmap3 system call, which maps a file into the memory of the
+ * process named in the argument block.
+ * param arg : user-space pointer to the mapping parameters
+ * return : whatever do_mmap2() returns for the copied parameters, or
+ *          -EFAULT when the argument block cannot be read
+ *
+ * arg is a user pointer: its fields must only be read through
+ * copy_from_user().  The earlier version printed arg->addr and the other
+ * fields directly, before the copy.
+ */
+asmlinkage long sys_mmap3(struct _mmap3_arg_struct __user *arg)
+{
+	long error;
+	struct _mmap3_arg_struct a;
+	int ret;
+
+	error = -EFAULT;
+	/* Only pointer values are printed here; nothing is dereferenced. */
+	printk("sys_mmap3 called, arg=%p,&(arg.addr)=%p\n",arg,&(arg->addr));
+
+	// copy the struct in user space to kernel space
+	ret=copy_from_user(&a, arg, sizeof(a));
+	if(ret){
+		printk("mmap3 copy_from_user error.. %d byte left\n",ret);
+		printk("addr=%lx,len=%lx,prot=%lx,flags=%lx,fd=%lx,pgoff=%lx,pid=%ld\n",a.addr,a.len, a.prot, a.flags, a.fd, a.pgoff, a.pid);
+		goto out;
+	}else{
+		printk("Copy_from_User finish collecty, %dbytes left...\n", ret);
+		printk("addr=%lx,len=%lx,prot=%lx,flags=%lx,fd=%lx,pgoff=%lx,pid=%ld\n",a.addr,a.len, a.prot, a.flags, a.fd, a.pgoff, a.pid);
+	}
+	error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.pgoff, a.pid);
+
+ out:
+	printk("mmap3 out, return=%lx\n",error);
+	return error;
}

/*
@@ -101,7 +164,7 @@ asmlinkage int old_mmap(struct mmap_arg_
if (a.offset & ~PAGE_MASK)
goto out;

- err = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT);
+ err = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT,0);
out:
return err;
}
diff -urpN linux-2.6.11.7-vanilla/arch/i386/mm/mmap.c linux-2.6.11.7-pannus-i386/arch/i386/mm/mmap.c
--- linux-2.6.11.7-vanilla/arch/i386/mm/mmap.c 2005-04-08 03:57:36.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/arch/i386/mm/mmap.c 2005-04-18 12:32:13.000000000 +0900
@@ -62,10 +62,12 @@ void arch_pick_mmap_layout(struct mm_str
current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) {
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
+ mm->get_unmapped_area2 = arch_get_unmapped_area2;
mm->unmap_area = arch_unmap_area;
} else {
mm->mmap_base = mmap_base(mm);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+ mm->get_unmapped_area2 = arch_get_unmapped_area_topdown2;
mm->unmap_area = arch_unmap_area_topdown;
}
}
diff -urpN linux-2.6.11.7-vanilla/include/asm-i386/exechandle.h linux-2.6.11.7-pannus-i386/include/asm-i386/exechandle.h
--- linux-2.6.11.7-vanilla/include/asm-i386/exechandle.h 1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/include/asm-i386/exechandle.h 2005-04-18 12:32:13.000000000 +0900
@@ -0,0 +1,21 @@
+#ifndef _ASM_I386_EXECHANDLE_H
+#define _ASM_I386_EXECHANDLE_H
+
+#include <asm/types.h>
+#include <asm/signal.h>
+
+struct pt_regs;
+
+/*
+ * User-space entry points for live-patch initialization.
+ * inithandler: address loaded into eip by setup_init_frame()
+ * restorer:    address stored as the frame's return address (pretcode)
+ */
+struct initaction
+{
+	void (*inithandler)(int);
+	void (*restorer)(void);
+};
+
+/* Kernel-side copy of the initialization information, kept per task. */
+struct k_initaction
+{
+	struct initaction ia;
+};
+
+/* Both are called from entry.S on the return-to-user path. */
+void do_init(struct pt_regs *regs, sigset_t *oldset);
+void do_init_v86(struct pt_regs *regs, sigset_t *oldset);
+
+#endif /* _ASM_I386_EXECHANDLE_H */
diff -urpN linux-2.6.11.7-vanilla/include/asm-i386/thread_info.h linux-2.6.11.7-pannus-i386/include/asm-i386/thread_info.h
--- linux-2.6.11.7-vanilla/include/asm-i386/thread_info.h 2005-04-08 03:57:14.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/include/asm-i386/thread_info.h 2005-04-18 12:32:13.000000000 +0900
@@ -44,6 +44,9 @@ struct thread_info {
of nested (IRQ) stacks
*/
__u8 supervisor_stack[0];
+
+ __u32 inipending; /* Pending flags for live patch */
+ __u32 inifinish; /* Finish flags for live patch */
};

#else /* !__ASSEMBLY__ */
diff -urpN linux-2.6.11.7-vanilla/include/asm-i386/unistd.h linux-2.6.11.7-pannus-i386/include/asm-i386/unistd.h
--- linux-2.6.11.7-vanilla/include/asm-i386/unistd.h 2005-04-08 03:57:46.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/include/asm-i386/unistd.h 2005-04-18 12:32:13.000000000 +0900
@@ -294,8 +294,14 @@
#define __NR_add_key 286
#define __NR_request_key 287
#define __NR_keyctl 288
+/* Live patching system calls; the order matches sys_call_table in entry.S. */
+#define __NR_mmap3 289
+#define __NR_accesspvm (__NR_mmap3+1)
+#define __NR_init_pend (__NR_mmap3+2)
+#define __NR_rt_handlereturn (__NR_mmap3+3)
+#define __NR_check_init (__NR_mmap3+4)
+#define __NR_munmap3 (__NR_mmap3+5)

-#define NR_syscalls 289
+/* One more than the highest system call number (__NR_munmap3 is 294). */
+#define NR_syscalls 295

/*
* user-visible error numbers are in the range -1 - -128: see
diff -urpN linux-2.6.11.7-vanilla/include/linux/capability.h linux-2.6.11.7-pannus-i386/include/linux/capability.h
--- linux-2.6.11.7-vanilla/include/linux/capability.h 2005-04-08 03:57:26.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/include/linux/capability.h 2005-04-18 12:32:13.000000000 +0900
@@ -288,6 +288,10 @@ typedef __u32 kernel_cap_t;

#define CAP_AUDIT_CONTROL 30

+/* Allow use of memory access system calls for Live Patching */
+
+#define CAP_SYS_PANNUS 31
+
#ifdef __KERNEL__
/*
* Bounding set
diff -urpN linux-2.6.11.7-vanilla/include/linux/mm.h linux-2.6.11.7-pannus-i386/include/linux/mm.h
--- linux-2.6.11.7-vanilla/include/linux/mm.h 2005-04-08 03:57:09.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/include/linux/mm.h 2005-04-18 12:32:13.000000000 +0900
@@ -614,6 +614,7 @@ extern int install_page(struct mm_struct
extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
extern int make_pages_present(unsigned long addr, unsigned long end);
+extern int make_pages_present2(unsigned long addr, unsigned long end, struct task_struct *tsk);
extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
void install_arg_page(struct vm_area_struct *, struct page *, unsigned long);

@@ -730,10 +731,16 @@ extern void exit_mmap(struct mm_struct *

extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);

+extern unsigned long get_unmapped_area2(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, struct task_struct *);
+
extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long pgoff);

+extern unsigned long do_mmap_pgoff2(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long prot,
+ unsigned long flag, unsigned long pgoff, struct task_struct *);
+
static inline unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long offset)
diff -urpN linux-2.6.11.7-vanilla/include/linux/mman.h linux-2.6.11.7-pannus-i386/include/linux/mman.h
--- linux-2.6.11.7-vanilla/include/linux/mman.h 2005-04-08 03:57:13.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/include/linux/mman.h 2005-04-18 12:32:13.000000000 +0900
@@ -64,4 +64,17 @@ calc_vm_flag_bits(unsigned long flags)
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED );
}

+/*
+ * Argument block for the mmap3 system call.
+ * sys_mmap3() copies it from user space and passes the fields, in this
+ * order, to do_mmap2().
+ */
+typedef struct _mmap3_arg_struct {
+ unsigned long addr; /* address where file is loaded */
+ unsigned long len; /* length of data to be mapped */
+ unsigned long prot; /* permission of the memory where the file is mapped */
+ unsigned long flags; /* flag of mapped memory */
+ unsigned long fd; /* file descriptor of data to be mapped */
+ unsigned long pgoff; /* page offset of data to be mapped */
+ unsigned long pid; /* process ID; received by do_mmap2() as an int, which uses the current process unless it is > 0 */
+} mmap3_arg_struct_t;
+
#endif /* _LINUX_MMAN_H */
diff -urpN linux-2.6.11.7-vanilla/include/linux/sched.h linux-2.6.11.7-pannus-i386/include/linux/sched.h
--- linux-2.6.11.7-vanilla/include/linux/sched.h 2005-04-08 03:57:12.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/include/linux/sched.h 2005-04-18 12:32:13.000000000 +0900
@@ -21,6 +21,7 @@
#include <asm/ptrace.h>
#include <asm/mmu.h>
#include <asm/cputime.h>
+#include <asm/exechandle.h>

#include <linux/smp.h>
#include <linux/sem.h>
@@ -197,9 +198,19 @@ extern unsigned long
arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
unsigned long, unsigned long);
extern unsigned long
+arch_get_unmapped_area2(struct file *, unsigned long, unsigned long,
+ unsigned long, unsigned long, struct task_struct *);
+
+extern unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
unsigned long flags);
+
+/* arch_get_unmapped_area_topdown() variant with an extra argument naming the target task. */
+extern unsigned long
+arch_get_unmapped_area_topdown2(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags, struct task_struct *);
+
extern void arch_unmap_area(struct vm_area_struct *area);
extern void arch_unmap_area_topdown(struct vm_area_struct *area);

@@ -211,6 +222,11 @@ struct mm_struct {
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags);
+ /* counterpart of get_unmapped_area that additionally receives a task pointer */
+ unsigned long (*get_unmapped_area2) (struct file *filp,
+ unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags,
+ struct task_struct * tsk);
+
void (*unmap_area) (struct vm_area_struct *area);
unsigned long mmap_base; /* base of mmap area */
unsigned long free_area_cache; /* first hole */
@@ -685,6 +701,7 @@ struct task_struct {
struct mempolicy *mempolicy;
short il_next;
#endif
+ struct k_initaction k_ia; /* Initialization info for live patch */
};

static inline pid_t process_group(struct task_struct *tsk)
@@ -1173,6 +1190,7 @@ static inline void arch_pick_mmap_layout
{
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
+ mm->get_unmapped_area2 = arch_get_unmapped_area2;
mm->unmap_area = arch_unmap_area;
}
#endif
diff -urpN linux-2.6.11.7-vanilla/kernel/fork.c linux-2.6.11.7-pannus-i386/kernel/fork.c
--- linux-2.6.11.7-vanilla/kernel/fork.c 2005-04-08 03:57:12.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/kernel/fork.c 2005-04-18 12:32:13.000000000 +0900
@@ -2,6 +2,7 @@
* linux/kernel/fork.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2004-2005 NTT Corporation
*/

/*
@@ -412,6 +413,12 @@ void mm_release(struct task_struct *tsk,
u32 __user * tidptr = tsk->clear_child_tid;
tsk->clear_child_tid = NULL;

+ /* initialize flag and information for live patch */
+ tsk->thread_info->inipending=0;
+ tsk->thread_info->inifinish=0;
+ tsk->k_ia.ia.inithandler=NULL;
+ tsk->k_ia.ia.restorer=NULL;
+
/*
* We don't check the error code - if userspace has
* not set up a proper pointer then tough luck.
diff -urpN linux-2.6.11.7-vanilla/mm/memory.c linux-2.6.11.7-pannus-i386/mm/memory.c
--- linux-2.6.11.7-vanilla/mm/memory.c 2005-04-08 03:57:36.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/mm/memory.c 2005-04-18 12:32:13.000000000 +0900
@@ -2209,6 +2209,27 @@ int make_pages_present(unsigned long add
return ret == len ? 0 : -1;
}

+/*
+ * Fault in every page of [addr, end) in the address space of tsk.
+ *
+ * Returns 0 when all pages were obtained, the negative value returned by
+ * get_user_pages() on error, or -1 when no vma is found for addr or when
+ * fewer pages than requested were obtained.  An empty range, or a range
+ * running past the end of the vma found for addr, is a BUG().
+ */
+int make_pages_present2(unsigned long addr, unsigned long end, struct task_struct *tsk)
+{
+	struct vm_area_struct *area = find_vma(tsk->mm, addr);
+	int want_write, nr_pages, got;
+
+	if (area == NULL)
+		return -1;
+
+	/* Request write access only when the vma itself is writable. */
+	want_write = (area->vm_flags & VM_WRITE) ? 1 : 0;
+
+	if (addr >= end || end > area->vm_end)
+		BUG();
+
+	/* Number of pages touched by [addr, end), rounding end up. */
+	nr_pages = (end + PAGE_SIZE - 1) / PAGE_SIZE - addr / PAGE_SIZE;
+
+	got = get_user_pages(tsk, tsk->mm, addr, nr_pages, want_write,
+			     0, NULL, NULL);
+	if (got < 0)
+		return got;
+	if (got != nr_pages)
+		return -1;
+	return 0;
+}
+
/*
* Map a vmalloc()-space virtual address to the physical page.
*/
diff -urpN linux-2.6.11.7-vanilla/mm/mmap.c linux-2.6.11.7-pannus-i386/mm/mmap.c
--- linux-2.6.11.7-vanilla/mm/mmap.c 2005-04-08 03:57:45.000000000 +0900
+++ linux-2.6.11.7-pannus-i386/mm/mmap.c 2005-04-18 12:32:13.000000000 +0900
@@ -1143,6 +1143,239 @@ unacct_error:

EXPORT_SYMBOL(do_mmap_pgoff);

+/*
+ * do_mmap_pgoff2 - map a file or anonymous memory into the address space
+ * of the specified task (clone of do_mmap_pgoff).
+ *
+ * @file:  file to map, or NULL for an anonymous mapping
+ * @addr:  requested address (a hint unless MAP_FIXED is set in @flags)
+ * @len:   length of the mapping in bytes; rounded up to a page multiple
+ * @prot:  PROT_* protection bits
+ * @flags: MAP_* flags
+ * @pgoff: offset into @file, in pages
+ * @tsk:   task whose mm receives the mapping
+ *
+ * Returns the start address of the mapping on success.  On failure the
+ * return value is a negative errno converted to unsigned long, which is
+ * recognisable because it is not page aligned.
+ *
+ * NOTE(review): the up_write()/down_write() pair in the MAP_POPULATE path
+ * implies the caller holds tsk->mm->mmap_sem for writing - confirm
+ * against the caller.
+ *
+ * NOTE(review): can_do_mlock(), capable(), security_file_mmap(),
+ * security_vm_enough_memory(), acct_update_integrals() and
+ * update_mem_hiwater() take no task argument here, so they presumably
+ * act on the calling task rather than on @tsk - confirm that this is
+ * the intended policy.
+ */
+
+unsigned long do_mmap_pgoff2(struct file * file, unsigned long addr,
+			unsigned long len, unsigned long prot,
+			unsigned long flags, unsigned long pgoff, struct task_struct *tsk)
+{
+	struct mm_struct * mm = tsk->mm;
+	struct vm_area_struct * vma, * prev;
+	struct inode *inode;
+	unsigned int vm_flags;
+	int correct_wcount = 0;
+	int error;
+	struct rb_node ** rb_link, * rb_parent;
+	int accountable = 1;
+	unsigned long charged = 0;
+
+	if (file) {
+		if (is_file_hugepages(file))
+			accountable = 0;
+
+		if (!file->f_op || !file->f_op->mmap)
+			return -ENODEV;
+
+		if ((prot & PROT_EXEC) &&
+		    (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))
+			return -EPERM;
+	}
+
+	/* The personality of the target task decides READ_IMPLIES_EXEC. */
+	if ((prot & PROT_READ) && (tsk->personality & READ_IMPLIES_EXEC))
+		if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)))
+			prot |= PROT_EXEC;
+	if (!len)
+		return addr;
+
+	len = PAGE_ALIGN(len);
+	if (!len || len > TASK_SIZE)
+		return -EINVAL;
+
+	/* Reject a file offset that would wrap around. */
+	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
+		return -EINVAL;
+
+	/*
+	 * A task without an address space cannot receive a mapping; without
+	 * this check every mm-> access below would dereference NULL.
+	 */
+	if (!mm)
+		return -EINVAL;
+
+	if (mm->map_count > sysctl_max_map_count)
+		return -ENOMEM;
+
+	/* A result that is not page aligned is an error code. */
+	addr = get_unmapped_area2(file, addr, len, pgoff, flags, tsk);
+	if (addr & ~PAGE_MASK)
+		return addr;
+
+	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
+			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+
+	if (flags & MAP_LOCKED) {
+		if (!can_do_mlock())
+			return -EPERM;
+		vm_flags |= VM_LOCKED;
+	}
+	/* The locked-memory limit checked is the one of the target task. */
+	if (vm_flags & VM_LOCKED) {
+		unsigned long locked, lock_limit;
+		locked = mm->locked_vm << PAGE_SHIFT;
+		lock_limit = tsk->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+		locked += len;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+			return -EAGAIN;
+	}
+
+	inode = file ? file->f_dentry->d_inode : NULL;
+
+	if (file) {
+		switch (flags & MAP_TYPE) {
+		case MAP_SHARED:
+			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
+				return -EACCES;
+
+			if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
+				return -EACCES;
+
+			if (locks_verify_locked(inode))
+				return -EAGAIN;
+
+			vm_flags |= VM_SHARED | VM_MAYSHARE;
+			if (!(file->f_mode & FMODE_WRITE))
+				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
+
+			/* fall through: shared mappings need read access too */
+		case MAP_PRIVATE:
+			if (!(file->f_mode & FMODE_READ))
+				return -EACCES;
+			break;
+
+		default:
+			return -EINVAL;
+		}
+	} else {
+		switch (flags & MAP_TYPE) {
+		case MAP_SHARED:
+			vm_flags |= VM_SHARED | VM_MAYSHARE;
+			break;
+		case MAP_PRIVATE:
+			pgoff = addr >> PAGE_SHIFT;
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	error = security_file_mmap(file, prot, flags);
+	if (error)
+		return error;
+
+	/* Clear old maps that overlap the new range. */
+	error = -ENOMEM;
+munmap_back:
+	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+	if (vma && vma->vm_start < addr + len) {
+		if (do_munmap(mm, addr, len))
+			return -ENOMEM;
+		goto munmap_back;
+	}
+
+	/* The address-space limit checked is the one of the target task. */
+	if ((mm->total_vm << PAGE_SHIFT) + len
+	    > tsk->signal->rlim[RLIMIT_AS].rlim_cur)
+		return -ENOMEM;
+
+	if (accountable && (!(flags & MAP_NORESERVE) ||
+			    sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
+		if (vm_flags & VM_SHARED) {
+			vm_flags |= VM_ACCOUNT;
+		} else if (vm_flags & VM_WRITE) {
+			charged = len >> PAGE_SHIFT;
+			if (security_vm_enough_memory(charged))
+				return -ENOMEM;
+			vm_flags |= VM_ACCOUNT;
+		}
+	}
+
+	/* A private anonymous mapping may simply extend a neighbouring vma. */
+	if (!file && !(vm_flags & VM_SHARED) &&
+	    vma_merge(mm, prev, addr, addr + len, vm_flags,
+					NULL, NULL, pgoff, NULL))
+		goto out;
+
+	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	if (!vma) {
+		error = -ENOMEM;
+		goto unacct_error;
+	}
+	memset(vma, 0, sizeof(*vma));
+
+	vma->vm_mm = mm;
+	vma->vm_start = addr;
+	vma->vm_end = addr + len;
+	vma->vm_flags = vm_flags;
+	vma->vm_page_prot = protection_map[vm_flags & 0x0f];
+	vma->vm_pgoff = pgoff;
+
+	if (file) {
+		error = -EINVAL;
+		if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
+			goto free_vma;
+		if (vm_flags & VM_DENYWRITE) {
+			error = deny_write_access(file);
+			if (error)
+				goto free_vma;
+			correct_wcount = 1;
+		}
+		vma->vm_file = file;
+		get_file(file);
+		error = file->f_op->mmap(file, vma);
+		if (error)
+			goto unmap_and_free_vma;
+	} else if (vm_flags & VM_SHARED) {
+		error = shmem_zero_setup(vma);
+		if (error)
+			goto free_vma;
+	}
+
+	if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
+		vma->vm_flags &= ~VM_ACCOUNT;
+
+	/* Reload from the vma: the ->mmap() call above was handed the vma. */
+	addr = vma->vm_start;
+	pgoff = vma->vm_pgoff;
+	vm_flags = vma->vm_flags;
+
+	if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
+			vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
+		file = vma->vm_file;
+		vma_link(mm, vma, prev, rb_link, rb_parent);
+		if (correct_wcount)
+			atomic_inc(&inode->i_writecount);
+	} else {
+		/* Merged into an existing vma: drop the temporary one. */
+		if (file) {
+			if (correct_wcount)
+				atomic_inc(&inode->i_writecount);
+			fput(file);
+		}
+		mpol_free(vma_policy(vma));
+		kmem_cache_free(vm_area_cachep, vma);
+	}
+
+out:
+	mm->total_vm += len >> PAGE_SHIFT;
+	__vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
+	if (vm_flags & VM_LOCKED) {
+		mm->locked_vm += len >> PAGE_SHIFT;
+		make_pages_present2(addr, addr + len, tsk);
+	}
+	/*
+	 * sys_remap_file_pages() takes no mm or task argument and works on
+	 * the address space of the calling task.  Calling it for a foreign
+	 * mm would touch the caller's mappings at 'addr' instead of the new
+	 * mapping, so pre-population is done only when the target mm is the
+	 * caller's own; otherwise the pages are simply faulted in on demand.
+	 */
+	if ((flags & MAP_POPULATE) && mm == current->mm) {
+		up_write(&mm->mmap_sem);
+		sys_remap_file_pages(addr, len, 0,
+					pgoff, flags & MAP_NONBLOCK);
+		down_write(&mm->mmap_sem);
+	}
+	acct_update_integrals();
+	update_mem_hiwater();
+	return addr;
+
+unmap_and_free_vma:
+	if (correct_wcount)
+		atomic_inc(&inode->i_writecount);
+	vma->vm_file = NULL;
+	fput(file);
+
+	/* Undo any partial mapping done by the ->mmap() call. */
+	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
+free_vma:
+	kmem_cache_free(vm_area_cachep, vma);
+unacct_error:
+	if (charged)
+		vm_unacct_memory(charged);
+	return error;
+}
+EXPORT_SYMBOL(do_mmap_pgoff2);
+
+
/* Get an address range which is currently unmapped.
* For shmat() with addr=0.
*
@@ -1199,6 +1432,48 @@ full_search:
addr = vma->vm_end;
}
}
+
+/*
+ * Find an unmapped range of len bytes in the address space of tsk
+ * (clone of arch_get_unmapped_area).
+ *
+ * A non-zero addr is page aligned and used as is when the range starting
+ * there is free.  Otherwise the vma list is walked upwards starting at
+ * mm->free_area_cache, restarting once from TASK_UNMAPPED_BASE.
+ * Returns the start address found, or -ENOMEM.
+ */
+unsigned long
+arch_get_unmapped_area2(struct file *filp, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags, struct task_struct *tsk)
+{
+	struct mm_struct *target_mm = tsk->mm;
+	struct vm_area_struct *next;
+	unsigned long search_base;
+
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
+	/* Try the caller's hint first. */
+	if (addr) {
+		addr = PAGE_ALIGN(addr);
+		next = find_vma(target_mm, addr);
+		if (TASK_SIZE - len >= addr &&
+		    (next == NULL || addr + len <= next->vm_start))
+			return addr;
+	}
+
+	search_base = addr = target_mm->free_area_cache;
+	next = find_vma(target_mm, addr);
+
+	for (;;) {
+		if (TASK_SIZE - len < addr) {
+			/* Ran off the top: give up if the full range was scanned. */
+			if (search_base == TASK_UNMAPPED_BASE)
+				return -ENOMEM;
+
+			/* Otherwise scan once more from the bottom. */
+			search_base = addr = TASK_UNMAPPED_BASE;
+			next = find_vma(target_mm, addr);
+			continue;
+		}
+
+		if (next == NULL || addr + len <= next->vm_start) {
+			/* Remember where the next search should begin. */
+			target_mm->free_area_cache = addr + len;
+			return addr;
+		}
+
+		/* Hole too small: continue after this vma. */
+		addr = next->vm_end;
+		next = next->vm_next;
+	}
+}
+
#endif

void arch_unmap_area(struct vm_area_struct *area)
@@ -1300,6 +1575,66 @@ fail:

return addr;
}
+
+/*
+ * Get the area in the specific process where nothing is mapped.
+ * (clone of arch_get_unmapped_area_topdown)
+ *
+ * Searches the address space of tsk downwards, starting below
+ * mm->free_area_cache, for a hole of len bytes.  A non-zero addr0 is page
+ * aligned and returned as is when the range starting there is free.
+ * If the downward search fails twice, the search is handed over to
+ * arch_get_unmapped_area2().
+ * Returns the address found, or -ENOMEM when len exceeds TASK_SIZE;
+ * otherwise whatever the fallback returns.
+ */
+unsigned long
+arch_get_unmapped_area_topdown2(struct file *filp, const unsigned long addr0,
+ const unsigned long len, const unsigned long pgoff,
+ const unsigned long flags, struct task_struct *tsk)
+{
+ struct vm_area_struct *vma, *prev_vma;
+ struct mm_struct *mm = tsk->mm;
+ unsigned long base = mm->mmap_base, addr = addr0;
+ int first_time = 1;
+
+ if (len > TASK_SIZE)
+ return -ENOMEM;
+
+ /* never let the cached start point sit above the mmap base */
+ if (mm->free_area_cache > base)
+ mm->free_area_cache = base;
+
+ /* honour the requested address when the range there is free */
+ if (addr) {
+ addr = PAGE_ALIGN(addr);
+ vma = find_vma(mm, addr);
+ if (TASK_SIZE - len >= addr &&
+ (!vma || addr + len <= vma->vm_start))
+ return addr;
+ }
+
+try_again:
+ /* not enough room below the cached address for len bytes */
+ if (mm->free_area_cache < len)
+ goto fail;
+
+ /* first candidate: len bytes below the cached address, page aligned */
+ addr = (mm->free_area_cache - len) & PAGE_MASK;
+ do {
+ /* no vma ends above addr: the candidate is returned as is */
+ if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
+ return addr;
+
+ /* candidate fits between prev_vma and vma: cache and return it */
+ if (addr+len <= vma->vm_start &&
+ (!prev_vma || (addr >= prev_vma->vm_end)))
+ return (mm->free_area_cache = addr);
+ else
+ /* lower the cache when it sits exactly at the end of this vma */
+ if (mm->free_area_cache == vma->vm_end)
+ mm->free_area_cache = vma->vm_start;
+
+ /* next candidate: directly below this vma */
+ addr = vma->vm_start-len;
+ } while (len <= vma->vm_start);
+
+fail:
+ /* first failure: retry once with the search restarted from base */
+ if (first_time) {
+ mm->free_area_cache = base;
+ first_time = 0;
+ goto try_again;
+ }
+ /*
+ * Second failure: let arch_get_unmapped_area2() search with the cache
+ * set to TASK_UNMAPPED_BASE, then put the cache back to base.
+ */
+ mm->free_area_cache = TASK_UNMAPPED_BASE;
+ addr = arch_get_unmapped_area2(filp, addr0, len, pgoff, flags, tsk);
+ mm->free_area_cache = base;
+ return addr;
+}
+
#endif

void arch_unmap_area_topdown(struct vm_area_struct *area)
@@ -1350,6 +1685,35 @@ get_unmapped_area(struct file *file, uns

EXPORT_SYMBOL(get_unmapped_area);

+/*
+ * Pick the address for a new mapping of len bytes in the address space
+ * of tsk (clone of get_unmapped_area).
+ *
+ * Without MAP_FIXED the choice is delegated to the get_unmapped_area2
+ * hook of tsk->mm.  With MAP_FIXED, addr itself is validated and
+ * returned: -ENOMEM when the range would exceed TASK_SIZE, -EINVAL when
+ * addr is not page aligned or the hugepage range check fails.
+ */
+unsigned long
+get_unmapped_area2(struct file *file, unsigned long addr, unsigned long len,
+		unsigned long pgoff, unsigned long flags, struct task_struct *tsk)
+{
+	unsigned long range_err;
+
+	if (!(flags & MAP_FIXED))
+		return tsk->mm->get_unmapped_area2(file, addr, len, pgoff, flags, tsk);
+
+	if (addr > TASK_SIZE - len)
+		return -ENOMEM;
+	if (addr & ~PAGE_MASK)
+		return -EINVAL;
+
+	/* Hugepage files and ordinary mappings use different range checks. */
+	if (file && is_file_hugepages(file))
+		range_err = prepare_hugepage_range(addr, len);
+	else
+		range_err = is_hugepage_only_range(addr, len);
+
+	if (range_err)
+		return -EINVAL;
+	return addr;
+}
+
+EXPORT_SYMBOL(get_unmapped_area2);
+
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
{
@@ -1878,6 +2242,49 @@ static inline void verify_mm_writelocked
#endif
}

+
+
+/*
+ * munmap3 system call: remove the mapping [addr, addr + len) from the
+ * address space of the process identified by pid.
+ *
+ * The caller's uid must equal the target's euid, suid and uid, and the
+ * caller's gid must equal the target's egid, sgid and gid; otherwise the
+ * caller needs CAP_SYS_PANNUS.
+ *
+ * Returns the result of do_munmap(), or
+ *   -ESRCH  when no task with this pid exists,
+ *   -EPERM  when the permission check fails,
+ *   -EINVAL when the target has no address space.
+ */
+asmlinkage long sys_munmap3(unsigned long addr, size_t len, pid_t pid)
+{
+	int ret;
+	struct mm_struct *mm;
+
+	/* target process task struct */
+	struct task_struct *tsk;
+
+	/*
+	 * Look up the target and take a reference while tasklist_lock is
+	 * still held, so the task struct cannot be freed once the lock is
+	 * dropped.
+	 */
+	read_lock(&tasklist_lock);
+	tsk = find_task_by_pid(pid);
+	if (tsk)
+		get_task_struct(tsk);
+	read_unlock(&tasklist_lock);
+
+	if (!tsk)
+		return -ESRCH;
+
+	/* capability check */
+	if(((current->uid != tsk->euid) ||
+	    (current->uid != tsk->suid) ||
+	    (current->uid != tsk->uid) ||
+	    (current->gid != tsk->egid) ||
+	    (current->gid != tsk->sgid) ||
+	    (current->gid != tsk->gid)) && !capable(CAP_SYS_PANNUS)) {
+		/* invalid user in munmap3 */
+		ret = -EPERM;
+		goto out_put_task;
+	}
+
+	/*
+	 * Pin the address space for the duration of the unmap.  Kernel
+	 * threads and tasks that already released their mm have none.
+	 */
+	mm = get_task_mm(tsk);
+	if (!mm) {
+		ret = -EINVAL;
+		goto out_put_task;
+	}
+
+	down_write(&mm->mmap_sem);
+	ret = do_munmap(mm, addr, len);
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+
+out_put_task:
+	put_task_struct(tsk);
+	return ret;
+}
+
/*
* this is really a simplified "do_mmap". it only handles
* anonymous maps. eventually we may be able to do some


Attachments:
pannus-2.6.11.7-i386.patch (47.38 kB)

2005-04-18 05:21:45

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH i386] Live Patching Function on 2.6.11.7

On Mon, Apr 18, 2005 at 12:20:31PM +0900, Takashi Ikebe wrote:
> The patch was over 50k, so I separate it to each architecture and in line..
>
> This patch adds a function called "Live patching", which is defined in
> OSDL's carrier grade linux requirement definition, to the linux 2.6.11.7 kernel.

Traditionally, being in OSDL specs was a very good reason not to merge patches.

Can you please come up with real arguments instead of this requirements
bullshit? Also, I hope OSDL would invest their money in more useful things
than CGL; why has this idiocy still not stopped?