2005-04-18 03:22:38

by Takashi Ikebe

[permalink] [raw]
Subject: [PATCH x86_64] Live Patching Function on 2.6.11.7

diff -urpN linux-2.6.11.7-vanilla/arch/x86_64/kernel/Makefile linux-2.6.11.7-pannus-x86_64/arch/x86_64/kernel/Makefile
--- linux-2.6.11.7-vanilla/arch/x86_64/kernel/Makefile 2005-04-08 03:57:55.000000000 +0900
+++ linux-2.6.11.7-pannus-x86_64/arch/x86_64/kernel/Makefile 2005-04-18 10:45:47.000000000 +0900
@@ -7,7 +7,8 @@ EXTRA_AFLAGS := -traditional
obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \
ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
x8664_ksyms.o i387.o syscall.o vsyscall.o \
- setup64.o bootflag.o e820.o reboot.o quirks.o
+ setup64.o bootflag.o e820.o reboot.o quirks.o \
+ accesspvm.o exechandle.o

obj-$(CONFIG_X86_MCE) += mce.o
obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
diff -urpN linux-2.6.11.7-vanilla/arch/x86_64/kernel/accesspvm.c linux-2.6.11.7-pannus-x86_64/arch/x86_64/kernel/accesspvm.c
--- linux-2.6.11.7-vanilla/arch/x86_64/kernel/accesspvm.c 1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.11.7-pannus-x86_64/arch/x86_64/kernel/accesspvm.c 2005-04-18 10:52:31.000000000 +0900
@@ -0,0 +1,111 @@
+/*
+ * accesspvm.c
+ * Copyright (C) 2004 NTT Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * Provide the system call to read/write the specific data in the user process.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/errno.h>
+#include <asm/uaccess.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+asmlinkage int sys_accesspvm(long pid, unsigned long addr, long datap, int len, int flag)
+{
+ struct task_struct *tsk;
+ int ret = -EPERM;
+ long *p = NULL;
+
+ read_lock(&tasklist_lock); // lock tasklist
+ tsk = find_task_by_pid(pid);
+ if (tsk)
+ get_task_struct(tsk); // get task_struct
+ read_unlock(&tasklist_lock); // unlock tasklist
+ if (!tsk) {
+ ret = -ESRCH;
+ goto out;
+ }
+
+ if (pid <= 1) // you may not mess with kernel thread or init.
+ goto out_tsk;
+
+ if(((current->uid != tsk->euid) ||
+ (current->uid != tsk->suid) ||
+ (current->uid != tsk->uid) ||
+ (current->gid != tsk->egid) ||
+ (current->gid != tsk->sgid) ||
+ (current->gid != tsk->gid)) && !capable(CAP_SYS_PANNUS)) {
+ // invalid user in sys_accesspvm
+ return -EPERM;
+ }
+
+ p = vmalloc(len);
+ if(!p){
+ printk("accesspvm: Cannot allocate by vmalloc\n");
+ ret = -ENOMEM;
+ goto out_tsk;
+ }
+
+ if(flag == 0){
+ if(access_process_vm(tsk, addr, p, len, flag) != len) {
+ vfree(p);
+ ret = -EIO;
+ goto out_tsk;
+ }
+
+ if(copy_to_user((void *)datap,(const void *)p,len)){
+ printk("accesspvm: Copy_to_user error\n");
+ vfree(p);
+ ret = -EIO;
+ goto out_tsk;
+ }
+ ret = 0;
+ vfree(p);
+
+ }
+ else if(flag == 1) {
+ if(copy_from_user(p,(void *)datap,len)){
+ printk("accesspvm: Copy_from_user error\n");
+ vfree(p);
+ ret = -EIO;
+ goto out_tsk;
+ }
+
+ if (access_process_vm(tsk, addr, p, len, flag) == len){
+ vfree(p);
+ ret = 0;
+ goto out_tsk;
+ }
+ ret = -EIO;
+ vfree(p);
+ }
+ else {
+ vfree(p);
+ }
+
+out_tsk:
+ put_task_struct(tsk); // release the task_struct
+out:
+ return ret;
+}
diff -urpN linux-2.6.11.7-vanilla/arch/x86_64/kernel/asm-offsets.c linux-2.6.11.7-pannus-x86_64/arch/x86_64/kernel/asm-offsets.c
--- linux-2.6.11.7-vanilla/arch/x86_64/kernel/asm-offsets.c 2005-04-08 03:57:42.000000000 +0900
+++ linux-2.6.11.7-pannus-x86_64/arch/x86_64/kernel/asm-offsets.c 2005-04-18 10:45:47.000000000 +0900
@@ -33,6 +33,7 @@ int main(void)
ENTRY(flags);
ENTRY(addr_limit);
ENTRY(preempt_count);
+ ENTRY(inipending);
BLANK();
#undef ENTRY
#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
diff -urpN linux-2.6.11.7-vanilla/arch/x86_64/kernel/entry.S linux-2.6.11.7-pannus-x86_64/arch/x86_64/kernel/entry.S
--- linux-2.6.11.7-vanilla/arch/x86_64/kernel/entry.S 2005-04-08 03:57:30.000000000 +0900
+++ linux-2.6.11.7-pannus-x86_64/arch/x86_64/kernel/entry.S 2005-04-18 10:45:47.000000000 +0900
@@ -214,6 +214,8 @@ sysret_check:
/* Handle reschedules */
/* edx: work, edi: workmask */
sysret_careful:
+ cmpl $0,threadinfo_inipending(%rcx)
+ jne sysret_init
bt $TIF_NEED_RESCHED,%edx
jnc sysret_signal
sti
@@ -237,6 +239,16 @@ sysret_signal:
1: movl $_TIF_NEED_RESCHED,%edi
jmp sysret_check

+sysret_init:
+ movl $0,threadinfo_inipending(%rcx)
+ sti
+ xorl %esi,%esi # oldset
+ leaq -ARGOFFSET(%rsp),%rdi # regs
+ leaq do_init(%rip),%rax
+ call ptregscall_common
+ jmp sysret_check
+
+
/* Do syscall tracing */
tracesys:
SAVE_REST
@@ -395,6 +407,23 @@ ENTRY(stub_rt_sigreturn)
CFI_ENDPROC

/*
+ * In the case restorer calls rt_handlereturn, collect and store registers,
+ * and call rt_handlereturn with stored register struct.
+ */
+ENTRY(stub_rt_handlereturn)
+ CFI_STARTPROC
+ addq $8, %rsp
+ SAVE_REST
+ movq %rsp,%rdi
+ FIXUP_TOP_OF_STACK %r11
+ call sys_rt_handlereturn
+ movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
+ RESTORE_REST
+ jmp int_ret_from_sys_call
+ CFI_ENDPROC
+
+
+/*
* Interrupt entry/exit.
*
* Interrupt entry points save only callee clobbered registers in fast path.
@@ -481,6 +510,8 @@ bad_iret:

/* edi: workmask, edx: work */
retint_careful:
+ cmpl $0,threadinfo_inipending(%rcx)
+ jne retint_init
bt $TIF_NEED_RESCHED,%edx
jnc retint_signal
sti
@@ -527,6 +558,21 @@ retint_kernel:
#endif
CFI_ENDPROC

+retint_init:
+ CFI_STARTPROC
+ movl $0,threadinfo_inipending(%rcx)
+ sti
+ SAVE_REST
+ movq $-1,ORIG_RAX(%rsp)
+ xorq %rsi,%rsi # oldset
+ movq %rsp,%rdi # &pt_regs
+ call do_init
+ RESTORE_REST
+ cli
+ GET_THREAD_INFO(%rcx)
+ jmp retint_check
+ CFI_ENDPROC
+
/*
* APIC interrupts.
*/
diff -urpN linux-2.6.11.7-vanilla/arch/x86_64/kernel/exechandle.c linux-2.6.11.7-pannus-x86_64/arch/x86_64/kernel/exechandle.c
--- linux-2.6.11.7-vanilla/arch/x86_64/kernel/exechandle.c 1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.11.7-pannus-x86_64/arch/x86_64/kernel/exechandle.c 2005-04-18 10:58:52.000000000 +0900
@@ -0,0 +1,461 @@
+/*
+ * exechandle.c
+ * Copyright (C) 2004-2005 NTT Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * Initalization module.
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/wait.h>
+#include <linux/ptrace.h>
+#include <linux/unistd.h>
+#include <linux/stddef.h>
+#include <linux/tty.h>
+#include <linux/personality.h>
+#include <linux/compiler.h>
+#include <linux/binfmts.h>
+#include <asm/ucontext.h>
+#include <asm/uaccess.h>
+#include <asm/i387.h>
+#include <asm/proto.h>
+#include <asm/exechandle.h>
+
+extern int exception_trace;
+
+void init_fault(struct pt_regs *regs, void *frame, struct task_struct *me, char *where);
+
+struct rt_initframe
+{
+ char *pretcode; /* Return address after _init */
+ struct ucontext uc; /* user mode context before execute _init */
+ struct siginfo info; /* signal information before execute _init */
+};
+
+/*
+ * Restore the context before execute _init.
+ */
+static int
+restore_initcontext(struct pt_regs *regs, struct sigcontext *sc, unsigned long *prax)
+{
+ unsigned int err = 0;
+
+ /* Restore context from stored one before _init. */
+
+#define COPY(x) err |= __get_user(regs->x, &sc->x)
+
+ COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx);
+ COPY(rdx); COPY(rcx); COPY(rip);
+ COPY(r8);
+ COPY(r9);
+ COPY(r10);
+ COPY(r11);
+ COPY(r12);
+ COPY(r13);
+ COPY(r14);
+ COPY(r15);
+
+ {
+ unsigned int tmpflags;
+ err |= __get_user(tmpflags, &sc->eflags);
+ regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5);
+ regs->orig_rax = -1;
+ }
+
+ /* Restore the floating register, if used. */
+ {
+ struct _fpstate * buf;
+ err |= __get_user(buf, &sc->fpstate);
+
+ if (buf) {
+ if (verify_area(VERIFY_READ, buf, sizeof(*buf)))
+ goto badframe;
+ err |= restore_i387(buf);
+ }
+ }
+
+ err |= __get_user(*prax, &sc->rax);
+ return err;
+
+badframe:
+ return 1;
+}
+
+/*
+ * Restart the process with restoring stack.
+ */
+asmlinkage long sys_rt_handlereturn(struct pt_regs regs)
+{
+ struct rt_initframe *frame = (struct rt_initframe *)(regs.rsp - 8);
+ stack_t st;
+ long eax;
+ struct task_struct *me = current;
+
+ /* Check frame pointer */
+ if (verify_area(VERIFY_READ, frame, sizeof(*frame))) {
+ goto badframe;
+ }
+
+ /* Restore hardware context */
+ if (restore_initcontext(&regs, &frame->uc.uc_mcontext, &eax)) {
+ goto badframe;
+ }
+
+ /* Shift stack pointer */
+ if (__copy_from_user(&st, &frame->uc.uc_stack, sizeof(st))) {
+ goto badframe;
+ }
+
+ /* Clear initalization flag */
+ me->thread_info->inifinish=0;
+ return eax;
+
+ badframe:
+ me->thread_info->inifinish=-1;
+ init_fault(&regs,frame,me,"handlereturn");
+ return 0;
+}
+
+/*
+ * Set up hardware context for initialization.
+ */
+static inline int
+setup_initcontext(struct sigcontext *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me)
+{
+ int err = 0;
+
+ err |= __put_user(0, &sc->gs);
+ err |= __put_user(0, &sc->fs);
+
+ err |= __put_user(regs->rdi, &sc->rdi);
+ err |= __put_user(regs->rsi, &sc->rsi);
+ err |= __put_user(regs->rbp, &sc->rbp);
+ err |= __put_user(regs->rsp, &sc->rsp);
+ err |= __put_user(regs->rbx, &sc->rbx);
+ err |= __put_user(regs->rdx, &sc->rdx);
+ err |= __put_user(regs->rcx, &sc->rcx);
+ err |= __put_user(regs->rax, &sc->rax);
+ err |= __put_user(regs->r8, &sc->r8);
+ err |= __put_user(regs->r9, &sc->r9);
+ err |= __put_user(regs->r10, &sc->r10);
+ err |= __put_user(regs->r11, &sc->r11);
+ err |= __put_user(regs->r12, &sc->r12);
+ err |= __put_user(regs->r13, &sc->r13);
+ err |= __put_user(regs->r14, &sc->r14);
+ err |= __put_user(regs->r15, &sc->r15);
+ err |= __put_user(me->thread.trap_no, &sc->trapno);
+ err |= __put_user(me->thread.error_code, &sc->err);
+ err |= __put_user(regs->rip, &sc->rip);
+ err |= __put_user(regs->eflags, &sc->eflags);
+ err |= __put_user(mask, &sc->oldmask);
+ err |= __put_user(me->thread.cr2, &sc->cr2);
+
+ return err;
+}
+
+/*
+ * Fix stack pointer.
+ */
+static void *
+get_stack(struct pt_regs *regs, unsigned long size)
+{
+ unsigned long rsp;
+
+ /* Shift stack pointer by stack size. */
+ rsp = regs->rsp - 128;
+
+ /* Align page size boudaries */
+ return (void *)round_down(rsp - size, 16);
+}
+
+/*
+ * Set initialization frame and register.
+ */
+static void setup_init_frame(struct k_initaction *ka, struct pt_regs * regs,
+ sigset_t *set, struct task_struct *me)
+{
+ struct rt_initframe *frame;
+ struct _fpstate *fp = NULL;
+ int err = 0;
+
+ /* Store the floating point register, if used. */
+ if (tsk_used_math(me)!=0) {
+
+ fp = get_stack(regs, sizeof(struct _fpstate));
+ frame = (void *)round_down((u64)fp - sizeof(struct rt_initframe), 16) - 8;
+
+ if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) {
+ goto give_sigsegv;
+ }
+
+ if (save_i387(fp) < 0)
+ err |= -1;
+ } else {
+ frame = get_stack(regs, sizeof(struct rt_initframe)) - 8;
+ }
+
+ /* Check accessibility of stack */
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) {
+ goto give_sigsegv;
+ }
+
+ /* Create the ucontext. */
+ err |= __put_user(0, &frame->uc.uc_flags);
+ err |= __put_user(0, &frame->uc.uc_link);
+ err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
+ err |= __put_user(sas_ss_flags(regs->rsp),
+ &frame->uc.uc_stack.ss_flags);
+ err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
+ err |= setup_initcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
+ err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
+
+ /* Set the resotrer address as return address from inialization*/
+ err |= __put_user(ka->ia.restorer, &frame->pretcode);
+
+ if (err) {
+ goto give_sigsegv;
+ }
+
+ /* Set handler execution to register */
+ /* Set rax to 0 because _init has no prototype declaration.*/
+ regs->rax = 0;
+
+
+ /*
+ * Take over previous signal information in rsi and user context in uc,
+ * set stack pointer(rsp) to the start address of initialization frame,
+ * set instruction pointer(rip) to _init address, and switchover
+ * user_mode segment by setting cs and rs.
+ */
+ regs->rsi = (unsigned long)&frame->info;
+ regs->rdx = (unsigned long)&frame->uc;
+ regs->rsp = (unsigned long) frame;
+ regs->rip = (unsigned long) ka->ia.inithandler;
+ regs->cs = __USER_CS;
+ regs->ss = __USER_DS;
+
+ set_fs(USER_DS);
+ regs->eflags &= ~TF_MASK;
+
+ return;
+
+give_sigsegv:
+ me->thread_info->inipending=0;
+ me->thread_info->inifinish=-1;
+ init_fault(regs,frame,me,"handle deliver");
+ return;
+}
+
+/*
+ * Check caller and invoke setup_init_frame.
+ * param:regs register struct
+ * param:ka information for initialization
+ * param:oldset signal set
+ * return:none
+ */
+void
+handle_init(struct pt_regs *regs, struct k_initaction *ka, sigset_t *oldset)
+{
+
+
+ /* Check the caller is kernel or not */
+ if ((long)regs->orig_rax >= 0) {
+ /* Return EINTER, if the caller is during systemcall. */
+ switch (regs->rax) {
+ case -ERESTART_RESTARTBLOCK:
+ case -ERESTARTNOHAND:
+ regs->rax = -EINTR;
+ break;
+
+ case -ERESTARTSYS:
+ regs->rax = -EINTR;
+ break;
+ /* Skip if the value in rax is error from the beginning.*/
+ case -ERESTARTNOINTR:
+ regs->rax = regs->orig_rax;
+ regs->rip -= 2;
+ break;
+
+ default:
+
+ break;
+ }
+ }
+
+ setup_init_frame(ka, regs, oldset, current);
+
+}
+
+
+/*
+ * Check the register and invoke handle_init.
+ */
+void do_init(struct pt_regs *regs, sigset_t *oldset)
+{
+
+ struct k_initaction *ka=&current->k_ia;
+
+ /* Exit if the third flag of CS register is not 3. */
+ if ((regs->cs & 3) != 3) {
+ current->thread_info->inifinish=-1;
+ return;
+ }
+
+ /* Block if there is signal set.*/
+ if (!oldset){
+ oldset = &current->blocked;
+ }
+
+ /* Clear debug watch point register.*/
+ if (current->thread.debugreg7){
+ asm volatile("movq %0,%%db7" :: "r" (current->thread.debugreg7));
+ }
+
+ handle_init(regs,ka,oldset);
+
+ return;
+}
+
+/*
+ * Output error in case of illegal.
+ */
+void init_fault(struct pt_regs *regs, void *frame, struct task_struct *me, char *where)
+{
+
+ /* Output messages if it is illegal.*/
+ if (exception_trace)
+ printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n",
+ me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax);
+
+}
+
+/*
+ * Set inipending flag.
+ */
+asmlinkage int sys_init_pend(pid_t pid, struct k_initaction *user_k_ia)
+{
+ struct k_initaction ka;
+ struct task_struct *tsk;
+ int error;
+
+ /* Copy initialization information from user area to kernel area. */
+ error = -EFAULT;
+ if(copy_from_user(&ka,user_k_ia,sizeof(ka)))
+ goto out;
+
+ /* if pid <= 1, parameter error */
+ error = -EPERM;
+ if (pid <= 1)
+ goto out;
+
+ /* Get task struct from PID. */
+ error = -ESRCH;
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_pid((pid_t)pid);
+ if(tsk)
+ get_task_struct(tsk);
+ read_unlock(&tasklist_lock);
+ if(!tsk)
+ goto out;
+
+ /* capability check. */
+ if(((current->uid != tsk->euid) ||
+ (current->uid != tsk->suid) ||
+ (current->uid != tsk->uid) ||
+ (current->gid != tsk->egid) ||
+ (current->gid != tsk->sgid) ||
+ (current->gid != tsk->gid)) && !capable(CAP_SYS_PANNUS)) {
+ /* Invalid user */
+ error = -EPERM;
+ return error;
+ }
+
+ error=-EPERM;
+ /* flag pending */
+ tsk->thread_info->inipending=1;
+ tsk->thread_info->inifinish=1;
+
+ /* set k_ia */
+ tsk->k_ia = ka;
+
+ smp_mb();
+
+ return 0;
+out:
+ return error;
+}
+
+/*
+ * Check inifinish.
+ */
+asmlinkage int sys_check_init(pid_t pid)
+{
+ struct task_struct *tsk;
+ int error;
+
+ /* if pid <= 1, parameter error */
+ error = -EPERM;
+ if (pid <= 1)
+ goto out;
+
+ /* Get task struct from pid.*/
+ error = -ESRCH;
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_pid((pid_t)pid);
+ if(tsk)
+ get_task_struct(tsk);
+ read_unlock(&tasklist_lock);
+ if(!tsk)
+ goto out;
+
+ /* capability check. */
+ error = -EPERM;
+ if(((current->uid != tsk->euid) ||
+ (current->uid != tsk->suid) ||
+ (current->uid != tsk->uid) ||
+ (current->gid != tsk->egid) ||
+ (current->gid != tsk->sgid) ||
+ (current->gid != tsk->gid)) && !capable(CAP_SYS_PANNUS)) {
+ /* Invalid user */
+ goto out;
+ }
+
+ /*
+ * Check inifinish in task struct.
+ * If it is 0, return 0, if it is -1, return -1, else return EAGAIN.
+ */
+ if(tsk->thread_info->inifinish==0){
+ return 0;
+ }else if(tsk->thread_info->inifinish==-1){
+ error = -EINVAL;
+ goto out;
+ }else{
+ error = -EAGAIN;
+ goto out;
+ }
+
+out:
+ return error;
+}
+
diff -urpN linux-2.6.11.7-vanilla/arch/x86_64/kernel/sys_x86_64.c linux-2.6.11.7-pannus-x86_64/arch/x86_64/kernel/sys_x86_64.c
--- linux-2.6.11.7-vanilla/arch/x86_64/kernel/sys_x86_64.c 2005-04-08 03:57:47.000000000 +0900
+++ linux-2.6.11.7-pannus-x86_64/arch/x86_64/kernel/sys_x86_64.c 2005-04-18 11:05:42.000000000 +0900
@@ -16,6 +16,7 @@
#include <linux/file.h>
#include <linux/utsname.h>
#include <linux/personality.h>
+#include <linux/sched.h>

#include <asm/uaccess.h>
#include <asm/ipc.h>
@@ -66,6 +67,57 @@ out:
return error;
}

+/*
+ * Provide the mmap3 system call which maps the file to the specified process's memory.
+ */
+long sys_mmap3(void *arg)
+{
+ long error;
+ struct file * file;
+ mmap3_arg_struct_t a;
+ struct task_struct *tsk; // task_struct of process where plug will be loaded
+ // copy the struct in user space to kernel space
+ error = -EFAULT;
+ if (copy_from_user(&a, arg, sizeof(a)))
+ goto out;
+ error = -EINVAL;
+ if (a.pgoff & ~PAGE_MASK)
+ goto out;
+ error = -EBADF;
+ file = NULL;
+ a.flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+ if (!(a.flags & MAP_ANONYMOUS)) {
+ file = fget(a.fd);
+ if (!file)
+ goto out;
+ }
+ error = -ESRCH;
+ read_lock(&tasklist_lock); // lock tasklist
+ tsk = find_task_by_pid((pid_t)a.pid);
+ if (tsk)
+ get_task_struct(tsk);
+ read_unlock(&tasklist_lock); // unlock tasklist
+ if (!tsk)
+ goto out;
+ if(((current->uid != tsk->euid) ||
+ (current->uid != tsk->suid) ||
+ (current->uid != tsk->uid) ||
+ (current->gid != tsk->egid) ||
+ (current->gid != tsk->sgid) ||
+ (current->gid != tsk->gid)) && !capable(CAP_SYS_PANNUS)) {
+ // invalid user in sys_accesspvm
+ return -EPERM;
+ }
+ // map the file to memory
+ down_write(&tsk->mm->mmap_sem);
+ error = (long)do_mmap_pgoff2(file, a.addr, a.len, a.prot, a.flags, a.pgoff >> PAGE_SHIFT, tsk);
+ up_write(&tsk->mm->mmap_sem);
+ if (file)
+ fput(file);
+out:
+ return error;
+}
+
static void find_start_end(unsigned long flags, unsigned long *begin,
unsigned long *end)
{
@@ -142,6 +194,52 @@ full_search:
}
}

+unsigned long
+arch_get_unmapped_area2(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags, struct task_struct *tsk)
+{
+ struct mm_struct *mm = tsk->mm;
+ struct vm_area_struct *vma;
+ unsigned long start_addr;
+ unsigned long begin, end;
+
+ find_start_end(flags, &begin, &end);
+
+ if (len > end)
+ return -ENOMEM;
+
+ if (addr) {
+ addr = PAGE_ALIGN(addr);
+ vma = find_vma(mm, addr);
+ if (end - len >= addr &&
+ (!vma || addr + len <= vma->vm_start))
+ return addr;
+ }
+ addr = mm->free_area_cache;
+ if (addr < begin)
+ addr = begin;
+ start_addr = addr;
+
+full_search:
+ for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+ if (end - len < addr) {
+ if (start_addr != begin) {
+ start_addr = addr = begin;
+ goto full_search;
+ }
+ return -ENOMEM;
+ }
+ if (!vma || addr + len <= vma->vm_start) {
+ mm->free_area_cache = addr + len;
+ return addr;
+ }
+ addr = vma->vm_end;
+ }
+}
+
+
+
asmlinkage long sys_uname(struct new_utsname __user * name)
{
int err;
diff -urpN linux-2.6.11.7-vanilla/include/asm-x86_64/exechandle.h linux-2.6.11.7-pannus-x86_64/include/asm-x86_64/exechandle.h
--- linux-2.6.11.7-vanilla/include/asm-x86_64/exechandle.h 1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.11.7-pannus-x86_64/include/asm-x86_64/exechandle.h 2005-04-18 10:45:47.000000000 +0900
@@ -0,0 +1,21 @@
+#ifndef _ASM_X86_64_EXECHANDLE_H
+#define _ASM_X86_64_EXECHANDLE_H
+
+#include <asm/types.h>
+#include <asm/signal.h>
+
+
+struct initaction
+{
+ void (*inithandler)(int);
+ void (*restorer)(void);
+};
+
+struct k_initaction
+{
+ struct initaction ia;
+};
+
+void do_init(struct pt_regs *regs, sigset_t *oldset);
+
+#endif
diff -urpN linux-2.6.11.7-vanilla/include/asm-x86_64/thread_info.h linux-2.6.11.7-pannus-x86_64/include/asm-x86_64/thread_info.h
--- linux-2.6.11.7-vanilla/include/asm-x86_64/thread_info.h 2005-04-08 03:57:52.000000000 +0900
+++ linux-2.6.11.7-pannus-x86_64/include/asm-x86_64/thread_info.h 2005-04-18 10:45:47.000000000 +0900
@@ -33,6 +33,9 @@ struct thread_info {

mm_segment_t addr_limit;
struct restart_block restart_block;
+
+ __u32 inipending; /* pending flags for live patch */
+ __u32 inifinish; /* finish flags for live patch */
};
#endif

diff -urpN linux-2.6.11.7-vanilla/include/asm-x86_64/unistd.h linux-2.6.11.7-pannus-x86_64/include/asm-x86_64/unistd.h
--- linux-2.6.11.7-vanilla/include/asm-x86_64/unistd.h 2005-04-08 03:57:51.000000000 +0900
+++ linux-2.6.11.7-pannus-x86_64/include/asm-x86_64/unistd.h 2005-04-18 10:45:47.000000000 +0900
@@ -563,8 +563,21 @@ __SYSCALL(__NR_add_key, sys_add_key)
__SYSCALL(__NR_request_key, sys_request_key)
#define __NR_keyctl 250
__SYSCALL(__NR_keyctl, sys_keyctl)
+#define __NR_mmap3 251
+__SYSCALL(__NR_mmap3, sys_mmap3)
+#define __NR_accesspvm 252
+__SYSCALL(__NR_accesspvm, sys_accesspvm)
+#define __NR_init_pend 253
+__SYSCALL(__NR_init_pend, sys_init_pend)
+#define __NR_rt_handlereturn 254
+__SYSCALL(__NR_rt_handlereturn, stub_rt_handlereturn)
+#define __NR_check_init 255
+__SYSCALL(__NR_check_init, sys_check_init)
+#define __NR_munmap3 256
+__SYSCALL(__NR_munmap3, sys_munmap3)

-#define __NR_syscall_max __NR_keyctl
+
+#define __NR_syscall_max __NR_munmap3
#ifndef __NO_STUBS

/* user-visible error numbers are in the range -1 - -4095 */
@@ -751,6 +764,7 @@ static inline pid_t waitpid(int pid, int
extern long sys_mmap(unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long off);
+extern long sys_mmap3(void *);

extern int sys_modify_ldt(int func, void *ptr, unsigned long bytecount);

diff -urpN linux-2.6.11.7-vanilla/include/linux/capability.h linux-2.6.11.7-pannus-x86_64/include/linux/capability.h
--- linux-2.6.11.7-vanilla/include/linux/capability.h 2005-04-08 03:57:26.000000000 +0900
+++ linux-2.6.11.7-pannus-x86_64/include/linux/capability.h 2005-04-18 10:45:47.000000000 +0900
@@ -288,6 +288,10 @@ typedef __u32 kernel_cap_t;

#define CAP_AUDIT_CONTROL 30

+/* Allow use of memory access system calls for Live Patching */
+
+#define CAP_SYS_PANNUS 31
+
#ifdef __KERNEL__
/*
* Bounding set
diff -urpN linux-2.6.11.7-vanilla/include/linux/mm.h linux-2.6.11.7-pannus-x86_64/include/linux/mm.h
--- linux-2.6.11.7-vanilla/include/linux/mm.h 2005-04-08 03:57:09.000000000 +0900
+++ linux-2.6.11.7-pannus-x86_64/include/linux/mm.h 2005-04-18 10:45:47.000000000 +0900
@@ -614,6 +614,7 @@ extern int install_page(struct mm_struct
extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
extern int make_pages_present(unsigned long addr, unsigned long end);
+extern int make_pages_present2(unsigned long addr, unsigned long end, struct task_struct *tsk);
extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
void install_arg_page(struct vm_area_struct *, struct page *, unsigned long);

@@ -730,10 +731,16 @@ extern void exit_mmap(struct mm_struct *

extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);

+extern unsigned long get_unmapped_area2(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, struct task_struct *);
+
extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long pgoff);

+extern unsigned long do_mmap_pgoff2(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long prot,
+ unsigned long flag, unsigned long pgoff, struct task_struct *);
+
static inline unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long offset)
diff -urpN linux-2.6.11.7-vanilla/include/linux/mman.h linux-2.6.11.7-pannus-x86_64/include/linux/mman.h
--- linux-2.6.11.7-vanilla/include/linux/mman.h 2005-04-08 03:57:13.000000000 +0900
+++ linux-2.6.11.7-pannus-x86_64/include/linux/mman.h 2005-04-18 10:45:47.000000000 +0900
@@ -64,4 +64,17 @@ calc_vm_flag_bits(unsigned long flags)
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED );
}

+/*
+ * Struct of argument to pass the mmap3 system call.
+ */
+typedef struct _mmap3_arg_struct {
+ unsigned long addr; /* address where file is loaded */
+ unsigned long len; /* length of data to be maped */
+ unsigned long prot; /* permission of the memory where the file is mapped */
+ unsigned long flags; /* flag of mapped memory */
+ unsigned long fd; /* file descriptor of data to be mapped */
+ unsigned long pgoff; /* page offset of data to be mapped */
+ unsigned long pid; /* process ID */
+} mmap3_arg_struct_t;
+
#endif /* _LINUX_MMAN_H */
diff -urpN linux-2.6.11.7-vanilla/include/linux/sched.h linux-2.6.11.7-pannus-x86_64/include/linux/sched.h
--- linux-2.6.11.7-vanilla/include/linux/sched.h 2005-04-08 03:57:12.000000000 +0900
+++ linux-2.6.11.7-pannus-x86_64/include/linux/sched.h 2005-04-18 10:45:47.000000000 +0900
@@ -21,6 +21,7 @@
#include <asm/ptrace.h>
#include <asm/mmu.h>
#include <asm/cputime.h>
+#include <asm/exechandle.h>

#include <linux/smp.h>
#include <linux/sem.h>
@@ -197,9 +198,19 @@ extern unsigned long
arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
unsigned long, unsigned long);
extern unsigned long
+arch_get_unmapped_area2(struct file *, unsigned long, unsigned long,
+ unsigned long, unsigned long, struct task_struct *);
+
+extern unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
unsigned long flags);
+
+extern unsigned long
+arch_get_unmapped_area_topdown2(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags, struct task_struct *);
+
extern void arch_unmap_area(struct vm_area_struct *area);
extern void arch_unmap_area_topdown(struct vm_area_struct *area);

@@ -211,6 +222,11 @@ struct mm_struct {
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags);
+ unsigned long (*get_unmapped_area2) (struct file *filp,
+ unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags,
+ struct task_struct * tsk);
+
void (*unmap_area) (struct vm_area_struct *area);
unsigned long mmap_base; /* base of mmap area */
unsigned long free_area_cache; /* first hole */
@@ -685,6 +701,7 @@ struct task_struct {
struct mempolicy *mempolicy;
short il_next;
#endif
+ struct k_initaction k_ia; /*Inialization info for live patch */
};

static inline pid_t process_group(struct task_struct *tsk)
@@ -1173,6 +1190,7 @@ static inline void arch_pick_mmap_layout
{
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
+ mm->get_unmapped_area2 = arch_get_unmapped_area2;
mm->unmap_area = arch_unmap_area;
}
#endif
diff -urpN linux-2.6.11.7-vanilla/kernel/fork.c linux-2.6.11.7-pannus-x86_64/kernel/fork.c
--- linux-2.6.11.7-vanilla/kernel/fork.c 2005-04-08 03:57:12.000000000 +0900
+++ linux-2.6.11.7-pannus-x86_64/kernel/fork.c 2005-04-18 10:45:47.000000000 +0900
@@ -2,6 +2,7 @@
* linux/kernel/fork.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2004-2005 NTT Corporation
*/

/*
@@ -412,6 +413,12 @@ void mm_release(struct task_struct *tsk,
u32 __user * tidptr = tsk->clear_child_tid;
tsk->clear_child_tid = NULL;

+ /* initialize flag and information for live patch */
+ tsk->thread_info->inipending=0;
+ tsk->thread_info->inifinish=0;
+ tsk->k_ia.ia.inithandler=NULL;
+ tsk->k_ia.ia.restorer=NULL;
+
/*
* We don't check the error code - if userspace has
* not set up a proper pointer then tough luck.
diff -urpN linux-2.6.11.7-vanilla/mm/memory.c linux-2.6.11.7-pannus-x86_64/mm/memory.c
--- linux-2.6.11.7-vanilla/mm/memory.c 2005-04-08 03:57:36.000000000 +0900
+++ linux-2.6.11.7-pannus-x86_64/mm/memory.c 2005-04-18 10:45:47.000000000 +0900
@@ -2209,6 +2209,27 @@ int make_pages_present(unsigned long add
return ret == len ? 0 : -1;
}

+int make_pages_present2(unsigned long addr, unsigned long end, struct task_struct *tsk)
+{
+ int ret, len, write;
+ struct vm_area_struct * vma;
+
+ vma = find_vma(tsk->mm, addr);
+ if (!vma)
+ return -1;
+ write = (vma->vm_flags & VM_WRITE) != 0;
+ if (addr >= end)
+ BUG();
+ if (end > vma->vm_end)
+ BUG();
+ len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
+ ret = get_user_pages(tsk, tsk->mm, addr,
+ len, write, 0, NULL, NULL);
+ if (ret < 0)
+ return ret;
+ return ret == len ? 0 : -1;
+}
+
/*
* Map a vmalloc()-space virtual address to the physical page.
*/
diff -urpN linux-2.6.11.7-vanilla/mm/mmap.c linux-2.6.11.7-pannus-x86_64/mm/mmap.c
--- linux-2.6.11.7-vanilla/mm/mmap.c 2005-04-08 03:57:45.000000000 +0900
+++ linux-2.6.11.7-pannus-x86_64/mm/mmap.c 2005-04-18 11:04:40.000000000 +0900
@@ -1143,6 +1143,239 @@ unacct_error:

EXPORT_SYMBOL(do_mmap_pgoff);

+/*
+ * map the data which have the length specified in the file to the memory of
+ * the specified task.(clone of do_mmap_pgoff)
+ */
+
+unsigned long do_mmap_pgoff2(struct file * file, unsigned long addr,
+ unsigned long len, unsigned long prot,
+ unsigned long flags, unsigned long pgoff, struct task_struct *tsk)
+{
+ struct mm_struct * mm = tsk->mm;
+ struct vm_area_struct * vma, * prev;
+ struct inode *inode;
+ unsigned int vm_flags;
+ int correct_wcount = 0;
+ int error;
+ struct rb_node ** rb_link, * rb_parent;
+ int accountable = 1;
+ unsigned long charged = 0;
+
+ if (file) {
+ if (is_file_hugepages(file))
+ accountable = 0;
+
+ if (!file->f_op || !file->f_op->mmap)
+ return -ENODEV;
+
+ if ((prot & PROT_EXEC) &&
+ (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))
+ return -EPERM;
+ }
+
+ if ((prot & PROT_READ) && (tsk->personality & READ_IMPLIES_EXEC))
+ if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)))
+ prot |= PROT_EXEC;
+ if (!len)
+ return addr;
+
+ len = PAGE_ALIGN(len);
+ if (!len || len > TASK_SIZE)
+ return -EINVAL;
+
+ if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
+ return -EINVAL;
+
+ if (mm->map_count > sysctl_max_map_count)
+ return -ENOMEM;
+
+ addr = get_unmapped_area2(file, addr, len, pgoff, flags, tsk);
+ if (addr & ~PAGE_MASK)
+ return addr;
+
+ vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
+ mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+
+ if (flags & MAP_LOCKED) {
+ if (!can_do_mlock())
+ return -EPERM;
+ vm_flags |= VM_LOCKED;
+ }
+ if (vm_flags & VM_LOCKED) {
+ unsigned long locked, lock_limit;
+ locked = mm->locked_vm << PAGE_SHIFT;
+ lock_limit = tsk->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ locked += len;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+ return -EAGAIN;
+ }
+
+ inode = file ? file->f_dentry->d_inode : NULL;
+
+ if (file) {
+ switch (flags & MAP_TYPE) {
+ case MAP_SHARED:
+ if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
+ return -EACCES;
+
+ if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
+ return -EACCES;
+
+ if (locks_verify_locked(inode))
+ return -EAGAIN;
+
+ vm_flags |= VM_SHARED | VM_MAYSHARE;
+ if (!(file->f_mode & FMODE_WRITE))
+ vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
+
+ case MAP_PRIVATE:
+ if (!(file->f_mode & FMODE_READ))
+ return -EACCES;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ } else {
+ switch (flags & MAP_TYPE) {
+ case MAP_SHARED:
+ vm_flags |= VM_SHARED | VM_MAYSHARE;
+ break;
+ case MAP_PRIVATE:
+ pgoff = addr >> PAGE_SHIFT;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ error = security_file_mmap(file, prot, flags);
+ if (error)
+ return error;
+
+ error = -ENOMEM;
+munmap_back:
+ vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+ if (vma && vma->vm_start < addr + len) {
+ if (do_munmap(mm, addr, len))
+ return -ENOMEM;
+ goto munmap_back;
+ }
+ if ((mm->total_vm << PAGE_SHIFT) + len
+ > tsk->signal->rlim[RLIMIT_AS].rlim_cur)
+ return -ENOMEM;
+
+ if (accountable && (!(flags & MAP_NORESERVE) ||
+ sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
+ if (vm_flags & VM_SHARED) {
+ vm_flags |= VM_ACCOUNT;
+ } else if (vm_flags & VM_WRITE) {
+ charged = len >> PAGE_SHIFT;
+ if (security_vm_enough_memory(charged))
+ return -ENOMEM;
+ vm_flags |= VM_ACCOUNT;
+ }
+ }
+
+ if (!file && !(vm_flags & VM_SHARED) &&
+ vma_merge(mm, prev, addr, addr + len, vm_flags,
+ NULL, NULL, pgoff, NULL))
+ goto out;
+
+ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ if (!vma) {
+ error = -ENOMEM;
+ goto unacct_error;
+ }
+ memset(vma, 0, sizeof(*vma));
+
+ vma->vm_mm = mm;
+ vma->vm_start = addr;
+ vma->vm_end = addr + len;
+ vma->vm_flags = vm_flags;
+ vma->vm_page_prot = protection_map[vm_flags & 0x0f];
+ vma->vm_pgoff = pgoff;
+
+ if (file) {
+ error = -EINVAL;
+ if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
+ goto free_vma;
+ if (vm_flags & VM_DENYWRITE) {
+ error = deny_write_access(file);
+ if (error)
+ goto free_vma;
+ correct_wcount = 1;
+ }
+ vma->vm_file = file;
+ get_file(file);
+ error = file->f_op->mmap(file, vma);
+ if (error)
+ goto unmap_and_free_vma;
+ } else if (vm_flags & VM_SHARED) {
+ error = shmem_zero_setup(vma);
+ if (error)
+ goto free_vma;
+ }
+
+ if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
+ vma->vm_flags &= ~VM_ACCOUNT;
+
+
+ addr = vma->vm_start;
+ pgoff = vma->vm_pgoff;
+ vm_flags = vma->vm_flags;
+
+ if (!file || !vma_merge(mm, prev, addr, vma->vm_end,
+ vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
+ file = vma->vm_file;
+ vma_link(mm, vma, prev, rb_link, rb_parent);
+ if (correct_wcount)
+ atomic_inc(&inode->i_writecount);
+ } else {
+ if (file) {
+ if (correct_wcount)
+ atomic_inc(&inode->i_writecount);
+ fput(file);
+ }
+ mpol_free(vma_policy(vma));
+ kmem_cache_free(vm_area_cachep, vma);
+ }
+
+out:
+ mm->total_vm += len >> PAGE_SHIFT;
+ __vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
+ if (vm_flags & VM_LOCKED) {
+ mm->locked_vm += len >> PAGE_SHIFT;
+ make_pages_present2(addr, addr + len, tsk);
+ }
+ if (flags & MAP_POPULATE) {
+ up_write(&mm->mmap_sem);
+ sys_remap_file_pages(addr, len, 0,
+ pgoff, flags & MAP_NONBLOCK);
+ down_write(&mm->mmap_sem);
+ }
+ acct_update_integrals();
+ update_mem_hiwater();
+ return addr;
+
+unmap_and_free_vma:
+ if (correct_wcount)
+ atomic_inc(&inode->i_writecount);
+ vma->vm_file = NULL;
+ fput(file);
+
+ zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
+free_vma:
+ kmem_cache_free(vm_area_cachep, vma);
+unacct_error:
+ if (charged)
+ vm_unacct_memory(charged);
+ return error;
+}
+EXPORT_SYMBOL(do_mmap_pgoff2);
+
+
/* Get an address range which is currently unmapped.
* For shmat() with addr=0.
*
@@ -1199,6 +1432,48 @@ full_search:
addr = vma->vm_end;
}
}
+
+/*
+ * Get the area in the specific process where nothing is mapped.
+ * (clone of arch_get_unmapped_area)
+ */
+unsigned long
+arch_get_unmapped_area2(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags, struct task_struct *tsk)
+{
+
+ struct mm_struct *mm = tsk->mm;
+ struct vm_area_struct *vma;
+ unsigned long start_addr;
+ if (len > TASK_SIZE)
+ return -ENOMEM;
+
+ if (addr) {
+ addr = PAGE_ALIGN(addr);
+ vma = find_vma(mm, addr);
+ if (TASK_SIZE - len >= addr &&
+ (!vma || addr + len <= vma->vm_start))
+ return addr;
+ }
+ start_addr = addr = mm->free_area_cache;
+
+full_search:
+ for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+ if (TASK_SIZE - len < addr) {
+ if (start_addr != TASK_UNMAPPED_BASE) {
+ start_addr = addr = TASK_UNMAPPED_BASE;
+ goto full_search;
+ }
+ return -ENOMEM;
+ }
+ if (!vma || addr + len <= vma->vm_start) {
+ mm->free_area_cache = addr + len;
+ return addr;
+ }
+ addr = vma->vm_end;
+ }
+}
+
#endif

void arch_unmap_area(struct vm_area_struct *area)
@@ -1300,6 +1575,66 @@ fail:

return addr;
}
+
+/*
+ * Get the area in the specific process where nothing is mapped.
+ * (clone of arch_get_unmapped_area_topdown)
+ */
+unsigned long
+arch_get_unmapped_area_topdown2(struct file *filp, const unsigned long addr0,
+ const unsigned long len, const unsigned long pgoff,
+ const unsigned long flags, struct task_struct *tsk)
+{
+ struct vm_area_struct *vma, *prev_vma;
+ struct mm_struct *mm = tsk->mm;
+ unsigned long base = mm->mmap_base, addr = addr0;
+ int first_time = 1;
+
+ if (len > TASK_SIZE)
+ return -ENOMEM;
+
+ if (mm->free_area_cache > base)
+ mm->free_area_cache = base;
+
+ if (addr) {
+ addr = PAGE_ALIGN(addr);
+ vma = find_vma(mm, addr);
+ if (TASK_SIZE - len >= addr &&
+ (!vma || addr + len <= vma->vm_start))
+ return addr;
+ }
+
+try_again:
+ if (mm->free_area_cache < len)
+ goto fail;
+
+ addr = (mm->free_area_cache - len) & PAGE_MASK;
+ do {
+ if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
+ return addr;
+
+ if (addr+len <= vma->vm_start &&
+ (!prev_vma || (addr >= prev_vma->vm_end)))
+ return (mm->free_area_cache = addr);
+ else
+ if (mm->free_area_cache == vma->vm_end)
+ mm->free_area_cache = vma->vm_start;
+
+ addr = vma->vm_start-len;
+ } while (len <= vma->vm_start);
+
+fail:
+ if (first_time) {
+ mm->free_area_cache = base;
+ first_time = 0;
+ goto try_again;
+ }
+ mm->free_area_cache = TASK_UNMAPPED_BASE;
+ addr = arch_get_unmapped_area2(filp, addr0, len, pgoff, flags, tsk);
+ mm->free_area_cache = base;
+ return addr;
+}
+
#endif

void arch_unmap_area_topdown(struct vm_area_struct *area)
@@ -1350,6 +1685,35 @@ get_unmapped_area(struct file *file, uns

EXPORT_SYMBOL(get_unmapped_area);

+/*
+ * Get the area in the specific process where nothing is mapped.
+ * (clone of get_unmapped_area)
+ */
+unsigned long
+get_unmapped_area2(struct file *file, unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags, struct task_struct *tsk)
+{
+ if (flags & MAP_FIXED) {
+ unsigned long ret;
+
+ if (addr > TASK_SIZE - len)
+ return -ENOMEM;
+ if (addr & ~PAGE_MASK)
+ return -EINVAL;
+ if (file && is_file_hugepages(file)) {
+ ret = prepare_hugepage_range(addr, len);
+ } else {
+ ret = is_hugepage_only_range(addr, len);
+ }
+ if (ret)
+ return -EINVAL;
+ return addr;
+ }
+ return tsk->mm->get_unmapped_area2(file, addr, len, pgoff, flags, tsk);
+}
+
+EXPORT_SYMBOL(get_unmapped_area2);
+
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
{
@@ -1878,6 +2242,49 @@ static inline void verify_mm_writelocked
#endif
}

+
+
+/*
+ * Clear the specified mapped area in specified process.
+ * Provide the system call munmap3.
+ * Send memory map information struct to do_munmap.
+ */
+asmlinkage long sys_munmap3(unsigned long addr, size_t len, pid_t pid)
+{
+ int ret;
+ struct mm_struct *mm;
+
+ /* target process task struct */
+ struct task_struct *tsk;
+
+ /* get specified process task struct from pid.*/
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_pid(pid);
+ read_unlock(&tasklist_lock);
+
+ if (!tsk)
+ return -ESRCH;
+
+ // capability check
+ if(((current->uid != tsk->euid) ||
+ (current->uid != tsk->suid) ||
+ (current->uid != tsk->uid) ||
+ (current->gid != tsk->egid) ||
+ (current->gid != tsk->sgid) ||
+ (current->gid != tsk->gid)) && !capable(CAP_SYS_PANNUS)) {
+ // invalid user in munamp3
+ // EPERM:1 Operation not permitted
+ return -EPERM;
+ }
+
+
+ mm = tsk->mm;
+ down_write(&mm->mmap_sem);
+ ret = do_munmap(mm, addr, len);
+ up_write(&mm->mmap_sem);
+ return ret;
+}
+
/*
* this is really a simplified "do_mmap". it only handles
* anonymous maps. eventually we may be able to do some


Attachments:
pannus-2.6.11.7-x86_64.patch (41.96 kB)

2005-04-18 04:07:27

by Chris Wedgwood

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Mon, Apr 18, 2005 at 12:19:54PM +0900, Takashi Ikebe wrote:

> This patch add function called "Live patching" which is defined on
> OSDL's carrier grade linux requiremnt definition to linux 2.6.11.7
> kernel.

I;m curious as to what people decided this was a necessary
requirement.

> The live patching allows process to patch on-line (without
> restarting process) on i386 and x86_64 architectures, by overwriting
> jump assembly code on entry point of functions which you want to
> fix, to patched functions.

Why can't you use ptrace for all this?

2005-04-18 04:21:57

by Takashi Ikebe

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

Hello,

Chris Wedgwood wrote:
> On Mon, Apr 18, 2005 at 12:19:54PM +0900, Takashi Ikebe wrote:
>
>>This patch add function called "Live patching" which is defined on
>>OSDL's carrier grade linux requiremnt definition to linux 2.6.11.7
>>kernel.
> I;m curious as to what people decided this was a necessary
> requirement.

The requirements are comes from Network Equipment Providers, Telecom
Carriers, and Hardware Vendors,
You can see the attendee from below link;
http://groups.osdl.org/world_map/full_roster/

>>The live patching allows process to patch on-line (without
>>restarting process) on i386 and x86_64 architectures, by overwriting
>>jump assembly code on entry point of functions which you want to
>>fix, to patched functions.
> Why can't you use ptrace for all this?

GDB based approach seems not fit to our requirements. GDB(ptrace) based
functions are basically need to be done when target process is stopping.
In addition to that current PTRACE_PEEK/POKE* allows us to copy only a
*word* size...
From our experience, sometimes patches became to dozens to hundreds at
one patching, and in this case GDB based approach cause target process's
availability descent.

--
Takashi Ikebe
NTT Network Service Systems Laboratories
9-11, Midori-Cho 3-Chome Musashino-Shi,
Tokyo 180-8585 Japan
Tel : +81 422 59 4246, Fax : +81 422 60 4012
e-mail : [email protected]

2005-04-18 04:42:39

by Daniel Jacobowitz

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Mon, Apr 18, 2005 at 01:19:57PM +0900, Takashi Ikebe wrote:
> GDB based approach seems not fit to our requirements. GDB(ptrace) based
> functions are basically need to be done when target process is stopping.
> In addition to that current PTRACE_PEEK/POKE* allows us to copy only a
> *word* size...

While true, this is easily fixable. There is even an interface
precedent on OpenBSD (and possibly other platforms as well).

--
Daniel Jacobowitz
CodeSourcery, LLC

2005-04-18 04:55:44

by Nicholas Miell

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Mon, 2005-04-18 at 00:42 -0400, Daniel Jacobowitz wrote:
> On Mon, Apr 18, 2005 at 01:19:57PM +0900, Takashi Ikebe wrote:
> > GDB based approach seems not fit to our requirements. GDB(ptrace) based
> > functions are basically need to be done when target process is stopping.
> > In addition to that current PTRACE_PEEK/POKE* allows us to copy only a
> > *word* size...
>
> While true, this is easily fixable. There is even an interface
> precedent on OpenBSD (and possibly other platforms as well).
>

If we're going to be stealing ideas for debugging interfaces from other
operating systems, could we steal from Solaris instead of anything
ptrace-based?

--
Nicholas Miell <[email protected]>

2005-04-18 05:01:56

by Davide Libenzi

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Mon, 2005-04-18 at 00:42 -0400, Daniel Jacobowitz wrote:

> On Mon, Apr 18, 2005 at 01:19:57PM +0900, Takashi Ikebe wrote:
> > GDB based approach seems not fit to our requirements. GDB(ptrace) based
> > functions are basically need to be done when target process is stopping.
> > In addition to that current PTRACE_PEEK/POKE* allows us to copy only a
> > *word* size...
>
> While true, this is easily fixable.

Indeed, look at the systr_pmem_read() and systr_pmem_write() functions:

http://www.xmailserver.org/sysctr.html


- Davide

2005-04-18 05:05:56

by David Miller

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Mon, 18 Apr 2005 00:42:23 -0400
Daniel Jacobowitz <[email protected]> wrote:

> On Mon, Apr 18, 2005 at 01:19:57PM +0900, Takashi Ikebe wrote:
> > GDB based approach seems not fit to our requirements. GDB(ptrace) based
> > functions are basically need to be done when target process is stopping.
> > In addition to that current PTRACE_PEEK/POKE* allows us to copy only a
> > *word* size...
>
> While true, this is easily fixable. There is even an interface
> precedent on OpenBSD (and possibly other platforms as well).

Some platforms even support the necessary PTRADE_{READ,WRITE}DATA
operations already, sparc is one such platform.

2005-04-18 05:41:54

by Takashi Ikebe

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

Davide Libenzi wrote:

>On Mon, 2005-04-18 at 00:42 -0400, Daniel Jacobowitz wrote:
>
>
>
>>On Mon, Apr 18, 2005 at 01:19:57PM +0900, Takashi Ikebe wrote:
>>
>>
>>>GDB based approach seems not fit to our requirements. GDB(ptrace) based
>>>functions are basically need to be done when target process is stopping.
>>>In addition to that current PTRACE_PEEK/POKE* allows us to copy only a
>>>*word* size...
>>>
>>>
>>While true, this is easily fixable.
>>
>>
>
>Indeed, look at the systr_pmem_read() and systr_pmem_write() functions:
>
>http://www.xmailserver.org/sysctr.html
>
>
>
systr_pmem_read() and systr_pmem_write() just calls ptrace PTRACE_PEEKTEXT/DATA repeatedly....
In this case we need to *stop* target process whenever patch modules is loading....
It cause target process availability worse.

--
Takashi Ikebe
NTT Network Service Systems Laboratories
9-11, Midori-Cho 3-Chome Musashino-Shi,
Tokyo 180-8585 Japan
Tel : +81 422 59 4246, Fax : +81 422 60 4012
e-mail : [email protected]


2005-04-18 06:12:27

by Chris Wedgwood

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Mon, Apr 18, 2005 at 01:19:57PM +0900, Takashi Ikebe wrote:

> From our experience, sometimes patches became to dozens to hundreds
> at one patching, and in this case GDB based approach cause target
> process's availability descent.

i don't really buy that it can't be done or you complex patches are
necessary to be honest --- and there are various alternative APIs that
could help as others have pointed out


could you perhaps explain some *real* *world* applications/systems
where this is necessary and why existing APIs won't work with them
perhaps?

solving a real-world problem is much more interesting to listen to
that filling in a check-box on a (somewhat dubious) specification

2005-04-18 06:35:42

by Chris Friesen

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

Chris Wedgwood wrote:
> On Mon, Apr 18, 2005 at 01:19:57PM +0900, Takashi Ikebe wrote:
>
>
>>From our experience, sometimes patches became to dozens to hundreds
>>at one patching, and in this case GDB based approach cause target
>>process's availability descent.

> could you perhaps explain some *real* *world* applications/systems
> where this is necessary and why existing APIs won't work with them
> perhaps?

In the telecom space it's quite common to want to modify multiple
running binaries with as little downtime as possible. (Beyond a
threshold it becomes FCC-reportable in the US, and everyone wants to
avoid that...)

Our old proprietary OS had explicit support for replacing running binary
code on the fly, so customers have gotten used to the ability. Now they
want equivalent functionality with our linux-based stuff.

We've done some proprietary stuff (ie. pre-OSDL CGL) in this area, but
it was apparently a real pain and was quite restrictive on the
application writers. (I was not involved with that portion of the project.)

For general application support I suspect some kernel support will be
required. Whether this is the way to go or whether it can be done using
existing mechanisms, I'm not knowledgeable enough to comment.

Chris

2005-04-18 06:48:34

by Chris Wedgwood

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Mon, Apr 18, 2005 at 12:35:04AM -0600, Chris Friesen wrote:

> In the telecom space it's quite common to want to modify multiple
> running binaries with as little downtime as possible.

OK

> (Beyond a threshold it becomes FCC-reportable in the US, and
> everyone wants to avoid that...)

That's beside the point.

> Our old proprietary OS had explicit support for replacing running
> binary code on the fly, so customers have gotten used to the
> ability. Now they want equivalent functionality with our
> linux-based stuff.

*Why* do they need this is what I asked. A sensible real world
example would be useful.

> For general application support I suspect some kernel support will
> be required. Whether this is the way to go or whether it can be
> done using existing mechanisms, I'm not knowledgeable enough to
> comment.

I used to work in telco space, we had some such systems and similar
things. Some from Nortel even.

None of the things I saw did anything that I can image really need a
complicated kernel patch for.


In fact, I'm not convinced *any* of these uses really needed
live-patching at all.


I would just like some examples of real-world needs and an explanation
of why it's needed. Not handy-waving.

2005-04-18 07:32:34

by Takashi Ikebe

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

For example, please think about telephone switching software.
The software does not allow to stops over 100 milliseconds at worst
case.(this depends on how many customers are using, the more customers
are using, the allowable stopping time goes to shorter.)
And the system is required over 99.999% availability, so typically, the
system is constructed as ACT-SBY high-availability clustering model.
In the case of critical error the system takeover the service process to
SBY node.
Not to descent the service availability, software fix due to bug, should
not stop the service, and live patching is very historical function in
telecoms world.
Every carrier, NEPs(Network Equipment Provider) provide/use this
function to keep network service(such as telephone) available.
This function is very essential whenever the carrier use the linux as
center of it's system.

Therefore the live patching function should not stop the target
process(service process) as possible as. the more times we stop the
target process, the service goes unavailable...


Chris Wedgwood wrote:

>On Mon, Apr 18, 2005 at 01:19:57PM +0900, Takashi Ikebe wrote:
>
>
>
>>From our experience, sometimes patches became to dozens to hundreds
>>at one patching, and in this case GDB based approach cause target
>>process's availability descent.
>>
>>
>
>i don't really buy that it can't be done or you complex patches are
>necessary to be honest --- and there are various alternative APIs that
>could help as others have pointed out
>
>
>could you perhaps explain some *real* *world* applications/systems
>where this is necessary and why existing APIs won't work with them
>perhaps?
>
>solving a real-world problem is much more interesting to listen to
>that filling in a check-box on a (somewhat dubious) specification
>-
>To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>the body of a message to [email protected]
>More majordomo info at http://vger.kernel.org/majordomo-info.html
>Please read the FAQ at http://www.tux.org/lkml/
>
>


--
Takashi Ikebe
NTT Network Service Systems Laboratories
9-11, Midori-Cho 3-Chome Musashino-Shi,
Tokyo 180-8585 Japan
Tel : +81 422 59 4246, Fax : +81 422 60 4012
e-mail : [email protected]


2005-04-18 07:56:55

by Chris Wedgwood

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Mon, Apr 18, 2005 at 04:32:21PM +0900, Takashi Ikebe wrote:

> The software does not allow to stops over 100 milliseconds at worst
> case.

Out of interest, how do you ensure the process doesn't stop for that
long right now? Linux doesn't guarantee you'll get scheduled
(strictly speaking) in n milliseconds usually.

> Not to descent the service availability, software fix due to bug,
> should not stop the service, and live patching is very historical
> function in telecoms world.

Lots of really complicated and unnecessary things are common in the
telecoms world.

For the example you gave I can think of several ways to migrate data
to a new process (if need be) in a timely manner without interruption.
None of these *require* live patching.

> Every carrier, NEPs(Network Equipment Provider) provide/use this
> function to keep network service (such as telephone) available.

How does this *require* live patching?

> This function is very essential whenever the carrier use the linux
> as center of it's system.

Those are just marketing words.

> Therefore the live patching function should not stop the target
> process (service process) as possible as. the more times we stop the
> target process, the service goes unavailable...

Love patching seems like a very complicated thing to get right and it
could potentially blow up.

I'm guessing any suggestion of fixing the applications behavior would
be lost with some argument along the lines of: "this application was
written in 1824 by Ada Lovelace using pre-Roswell Alien Technology and
was certified NEBS compliant by the Deli Lama and god herself, so
clearly we can't touch a single line of it" or similar right?

2005-04-18 08:37:35

by Takashi Ikebe

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

I'm sorry, I can not catch the point well,
but what I want to say is, we do not want stop the service due to bug fix.

As you said, if we can migrate the data to new process without stopping
service, it is OK, but the real applications need to takeover data very
much(sometimes it's over gigabyte....depends on service, and causes
service disruption...). So, live patching seems reasonable to us.
And unfortunately the APIs which people suggested are all based on
ptrace system calls. as you know, ptrace based data read/write needs to
stop the target process during read/write. We would like to minimize
target process stopping time.

>I'm guessing any suggestion of fixing the applications behavior would
>be lost with some argument along the lines of: "this application was
>written in 1824 by Ada Lovelace using pre-Roswell Alien Technology and
>was certified NEBS compliant by the Deli Lama and god herself, so
>clearly we can't touch a single line of it" or similar right?
well, it's possibly, but the same problems occur on gdb.
I think this depends on user's manner...

I briefly describe the way of live patching below;
1. Load the patch modules with pannus -l command.
- load the patch module with first memory mapping system call(ptrace
PEEKDATA can be same work, but it needs to stop target process..)
- search patch module's initialize area and execute them with
execinit.c(similler to signal handler)
- target process exec initialization.
2. Activate the patch modules with pannus -a command.
- stop the target process and check current instruction not to conflict.
- if it is not conflict, overwrite the jump assembly to function's
entrypoiny where you want to fix, to patch module's one.
- restart the process.

Will this be answer??

Chris Wedgwood wrote:

>On Mon, Apr 18, 2005 at 04:32:21PM +0900, Takashi Ikebe wrote:
>
>
>
>>The software does not allow to stops over 100 milliseconds at worst
>>case.
>>
>>
>
>Out of interest, how do you ensure the process doesn't stop for that
>long right now? Linux doesn't guarantee you'll get scheduled
>(strictly speaking) in n milliseconds usually.
>
>
>
>>Not to descent the service availability, software fix due to bug,
>>should not stop the service, and live patching is very historical
>>function in telecoms world.
>>
>>
>
>Lots of really complicated and unnecessary things are common in the
>telecoms world.
>
>For the example you gave I can think of several ways to migrate data
>to a new process (if need be) in a timely manner without interruption.
>None of these *require* live patching.
>
>
>
>>Every carrier, NEPs(Network Equipment Provider) provide/use this
>>function to keep network service (such as telephone) available.
>>
>>
>
>How does this *require* live patching?
>
>
>
>>This function is very essential whenever the carrier use the linux
>>as center of it's system.
>>
>>
>
>Those are just marketing words.
>
>
>
>>Therefore the live patching function should not stop the target
>>process (service process) as possible as. the more times we stop the
>>target process, the service goes unavailable...
>>
>>
>
>Love patching seems like a very complicated thing to get right and it
>could potentially blow up.
>
>I'm guessing any suggestion of fixing the applications behavior would
>be lost with some argument along the lines of: "this application was
>written in 1824 by Ada Lovelace using pre-Roswell Alien Technology and
>was certified NEBS compliant by the Deli Lama and god herself, so
>clearly we can't touch a single line of it" or similar right?
>
>


--
Takashi Ikebe
NTT Network Service Systems Laboratories
9-11, Midori-Cho 3-Chome Musashino-Shi,
Tokyo 180-8585 Japan
Tel : +81 422 59 4246, Fax : +81 422 60 4012
e-mail : [email protected]


2005-04-18 09:01:06

by Chris Wedgwood

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Mon, Apr 18, 2005 at 05:37:09PM +0900, Takashi Ikebe wrote:

> As you said, if we can migrate the data to new process without
> stopping service, it is OK, but the real applications need to
> takeover data very much(sometimes it's over gigabyte....depends on
> service, and causes service disruption...).

man mmap
man 5 ipc

> So, live patching seems reasonable to us.

That still doesn't tell me why it's necessary to do something so
complicated

> 2. Activate the patch modules with pannus -a command.
> - stop the target process and check current instruction not to conflict.
> - if it is not conflict, overwrite the jump assembly to function's
> entrypoiny where you want to fix, to patch module's one.
> - restart the process.

there is a still a stop/start here

why not just hand the state of to a different process? how is that
slower?

> Will this be answer??

maybe, but i'm far from convinced it's necessary and therefore
warrants a big ugly kernel patch

2005-04-18 09:06:23

by James Courtier-Dutton

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

Chris Friesen wrote:
> Chris Wedgwood wrote:
>
>> On Mon, Apr 18, 2005 at 01:19:57PM +0900, Takashi Ikebe wrote:
>>
>>
>>> From our experience, sometimes patches became to dozens to hundreds
>>> at one patching, and in this case GDB based approach cause target
>>> process's availability descent.
>
>
>> could you perhaps explain some *real* *world* applications/systems
>> where this is necessary and why existing APIs won't work with them
>> perhaps?
>
>
> In the telecom space it's quite common to want to modify multiple
> running binaries with as little downtime as possible. (Beyond a
> threshold it becomes FCC-reportable in the US, and everyone wants to
> avoid that...)
>
> Our old proprietary OS had explicit support for replacing running binary
> code on the fly, so customers have gotten used to the ability. Now they
> want equivalent functionality with our linux-based stuff.
>
> We've done some proprietary stuff (ie. pre-OSDL CGL) in this area, but
> it was apparently a real pain and was quite restrictive on the
> application writers. (I was not involved with that portion of the project.)
>
> For general application support I suspect some kernel support will be
> required. Whether this is the way to go or whether it can be done using
> existing mechanisms, I'm not knowledgeable enough to comment.
>
> Chris
> -

I raised a thread like this about 1 year ago. I was asking for it from
the point of view of a Telco. After some discussions on this list, I
came to agree with the posts on the list by other people that the
feature is not needed. At least certainly not needed in the Telco space.
99.999% uptime is much better acheived with the use of clustering,
rather than trying to upgrade software in a Live situation. In a
clustered environment, one offloads all the tasks from machine A and
spread them across the cluster. Once the machine A is not doing any work
at all, you can upgrade, reboot, whatever you like, and then add it back
to the cluster. This approach is much less risky than live module updates.
If the equipment is not clustered, it will at least be 2 to 1 redundent,
so you just upgrade the redundent device, manually force a fail over,
and then upgrade the other device. Again, no live update required.

I can only think of one other system that might benefit from live
updates, and that is set top boxes, so bugs can be fixed without the
user knowing. This also can be worked around by downloading the bug
fixes and only installing the bugs fixes when the user is not viewing
the TV. E.g. When the box has been placed in standby by the user.

James

2005-04-18 09:13:35

by Chris Wedgwood

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Mon, Apr 18, 2005 at 11:03:39AM +0100, James Courtier-Dutton wrote:

> I can only think of one other system that might benefit from live
> updates, and that is set top boxes, so bugs can be fixed without the
> user knowing.

hardly mission critical and usually don't have the resources to do
complicate things

much better to rexec/reboot as needed

> This also can be worked around by downloading the bug fixes and only
> installing the bugs fixes when the user is not viewing the
> TV. E.g. When the box has been placed in standby by the user.

again it doesn't need live patching (satellite and cable boxes update
themselves routinely, some certainly do need to periodically reboot to
do this and it's apparently not a problem)

2005-04-18 09:19:10

by Paul Jackson

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

Chris wrote:
> Linux doesn't guarantee you'll get scheduled
> (strictly speaking) in n milliseconds usually.

The general case load doesn't apply here. Those doing call switching
know what they have running, and know that it won't give the scheduler
any opportunities to not run what must be run, in time. Given a
sufficiently short run queue, the scheduler is quite predictable.

And there is difference between the entire system being down for over
100 milliseconds, and a given call being delayed that long. Under
sufficient load (busiest hour on Mothers Day, say), the system must
continue to operate, though some switching may be delayed longer than
normal, though still within specified limits.

> > This function is very essential whenever ...
>
> Those are just marketing words.
> ...
> I'm guessing any suggestion of fixing the applications behavior would
> be lost with some argument along the lines of ...

The call switching folks have been doing live patching at least since I
worked on it, over 25 years ago. This is not just marketing.

No sense in being disrespectful to Takashi-san. This patch may or
may not be the best way to provide the functionality they require.
I don't even know if a kernel patch is needed.

But the tone of this thread won't lead anyone to better answers
anytime soon.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.650.933.1373, 1.925.600.0401

2005-04-18 09:25:42

by Chris Wedgwood

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Mon, Apr 18, 2005 at 02:16:09AM -0700, Paul Jackson wrote:

> The call switching folks have been doing live patching at least
> since I worked on it, over 25 years ago. This is not just
> marketing.

That still doesn't explain *why* live patching is needed.

2005-04-18 10:59:58

by Bodo Eggert

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

Takashi Ikebe <[email protected]> wrote:

> systr_pmem_read() and systr_pmem_write() just calls ptrace
> PTRACE_PEEKTEXT/DATA repeatedly.... In this case we need to *stop* target
> process whenever patch modules is loading....

You'll have to do that anyway, since you'll need to atomically store two
machine words. At least you'll have to lock access to the corresponding
memory page(s).
--
Error, no keyboard -- press F1 to continue.

Fri?, Spammer: [email protected] [email protected]
[email protected] [email protected] [email protected]

2005-04-18 11:30:48

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Mon, 18 Apr 2005, Chris Wedgwood wrote:
> On Mon, Apr 18, 2005 at 02:16:09AM -0700, Paul Jackson wrote:
>
> > The call switching folks have been doing live patching at least
> > since I worked on it, over 25 years ago. This is not just
> > marketing.
>
> That still doesn't explain *why* live patching is needed.

I suspect it was needed in the past, on embedded computers so
small they could only run one program at a time.

I see no reason why changing programs on the fly couldn't be
done nicer with SHM segments today - just start up the new
program in parallel with the old one, have it attach to the
SHM region and handshake with the old program to take over
operations.

At that point the old program can let go of file descriptors
(eg. those to devices), yield the CPU and the new program can
open those file descriptors. The SHM area contains all of the
state information needed, so the program can continue running
like it always would.

This may well be lower latency than live patching, and probably
lower complexity/risk too...

--
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan

2005-04-18 12:55:23

by Takashi Ikebe

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

Hello,
Rik van Riel wrote:
> On Mon, 18 Apr 2005, Chris Wedgwood wrote:
>
>>On Mon, Apr 18, 2005 at 02:16:09AM -0700, Paul Jackson wrote:
>>
>>
>>>The call switching folks have been doing live patching at least
>>>since I worked on it, over 25 years ago. This is not just
>>>marketing.
>>
>>That still doesn't explain *why* live patching is needed.
>
>
> I suspect it was needed in the past, on embedded computers so
> small they could only run one program at a time.
>
> I see no reason why changing programs on the fly couldn't be
> done nicer with SHM segments today - just start up the new
> program in parallel with the old one, have it attach to the
> SHM region and handshake with the old program to take over
> operations.
>
> At that point the old program can let go of file descriptors
> (eg. those to devices), yield the CPU and the new program can
> open those file descriptors. The SHM area contains all of the
> state information needed, so the program can continue running
> like it always would.
>
> This may well be lower latency than live patching, and probably
> lower complexity/risk too...
>

I think most important thing for carrier system is service availability.
The live patch only stops process(which have 3 threads) 180 nanoseconds
with 2functions, 2 variable changes on my linux desktop(Xeon 2.8G dual).
(on sample 1)

I believe process status copy consume more time, may be below sequences
are needed;
- Stop the service on ACT-process.
- Copy on memory/on transaction status to shared memory.
- Takeover shared memory key to SBY process and release the shared memory
- SBY process access to shared memory.
- SBY process checks the memory and reset broken sessions.
- SBY process restart the service.

Some part may be parallelize, but seems the more data make service
disruption time longer...(It seems exceeds 100 milliseconds depends on
data size..)
and process will be more complicated....makes more bugs...


--
Takashi Ikebe
NTT Network Service Systems Laboratories
9-11, Midori-Cho 3-Chome Musashino-Shi,
Tokyo 180-8585 Japan
Tel : +81 422 59 4246, Fax : +81 422 60 4012
e-mail : [email protected]

2005-04-18 14:07:13

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Mon, 18 Apr 2005, Takashi Ikebe wrote:

> I believe process status copy consume more time, may be below sequences are
> needed;
> - Stop the service on ACT-process.
> - Copy on memory/on transaction status to shared memory.

No need for this, the process could ALWAYS store its
status in a shared memory status. This is just as
fast as private memory, only more flexible.

> - Takeover shared memory key to SBY process and release the shared memory
> - SBY process access to shared memory.

Which means the SBY process can attach to the shared
memory region while the ACT process is running. It
can then communicate with the ACT process through a
socket ...

> - SBY process checks the memory and reset broken sessions.
> - SBY process restart the service.

... and the SBY process can take over immediately.
The state machine running the SBY software can
continue using the same data structures the ACT
process was using beforehand, since they're in a
shared memory region.

> Some part may be parallelize, but seems the more data make service
> disruption time longer...(It seems exceeds 100 milliseconds depends on
> data size..) and process will be more complicated....makes more bugs...

The data size should not be an issue, since the primary
copy of the state is in the shared memory area.

The state machine in the SBY process can directly run
using those data structures, so no copying is needed.

The only overhead will be inter-process communication,
having the first process close file descriptors, yielding
the CPU to the second process, which then opens up the
devices again. We both know how long a context switch
and an open() syscall take - negligable.

The old version of the program can shut itself down
after it knows the new version has taken over - in the
background, without disrupting the now active process.

--
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan

2005-04-18 14:30:23

by Paul Jackson

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

> That still doesn't explain *why* live patching is needed.

True enough.

When a requirement is based in 30 years of tradition and practice, it
takes work to to back it up to the essentials that would distinguish
accurately between adequate and inadequate alternatives. And that I
presume is what you mean by the emphasized *why*.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.650.933.1373, 1.925.600.0401

2005-04-19 02:15:06

by Takashi Ikebe

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

Rik van Riel wrote:

>On Mon, 18 Apr 2005, Takashi Ikebe wrote:
>
>
>
>>I believe process status copy consume more time, may be below sequences are
>>needed;
>>- Stop the service on ACT-process.
>>- Copy on memory/on transaction status to shared memory.
>>
>>
>
>No need for this, the process could ALWAYS store its
>status in a shared memory status. This is just as
>fast as private memory, only more flexible
>
>
I don't think so, because ACT process must stop service logic to
takeover, if the service use network listen port.(ACT process need to
stop service and close socket to take over.)

>>- Takeover shared memory key to SBY process and release the shared memory
>>- SBY process access to shared memory.
>>
>>
>
>Which means the SBY process can attach to the shared
>memory region while the ACT process is running. It
>can then communicate with the ACT process through a
>socket ...
>
>
this makes software developer crazy....

>>- SBY process checks the memory and reset broken sessions.
>>- SBY process restart the service.
>>
>>
>
>... and the SBY process can take over immediately.
>The state machine running the SBY software can
>continue using the same data structures the ACT
>process was using beforehand, since they're in a
>shared memory region.
>
>
>>Some part may be parallelize, but seems the more data make service
>>disruption time longer...(It seems exceeds 100 milliseconds depends on
>>data size..) and process will be more complicated....makes more bugs...
>>
>>
>
>The data size should not be an issue, since the primary
>copy of the state is in the shared memory area.
>
>
For me, is seems very dangerous to estimate the primary copy is not
broken through status takeover..

>The state machine in the SBY process can directly run
>using those data structures, so no copying is needed.
>
>The only overhead will be inter-process communication,
>having the first process close file descriptors, yielding
>the CPU to the second process, which then opens up the
>devices again. We both know how long a context switch
>and an open() syscall take - negligable.
>
>The old version of the program can shut itself down
>after it knows the new version has taken over - in the
>background, without disrupting the now active process.
>
>
>
I think your assumption works on some type of process, but not for all
the process.
Some process use critical resources such as fixed network listen port
can not speed up so.
More importantly, the only process who prepare to use this mechanism
only allows to use quick process takeover. This cause software
development difficult.
The live patching does not require to implement such special techniques
on applications.


--
Takashi Ikebe
NTT Network Service Systems Laboratories
9-11, Midori-Cho 3-Chome Musashino-Shi,
Tokyo 180-8585 Japan
Tel : +81 422 59 4246, Fax : +81 422 60 4012
e-mail : [email protected]


2005-04-19 04:27:36

by Chris Wedgwood

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Tue, Apr 19, 2005 at 11:14:27AM +0900, Takashi Ikebe wrote:

> this makes software developer crazy....

are you serious? how is live patching of .text easier than some of
the other suggestions which all are more or less sane and things like
gdb, oprofile, etc. will deal with w/o problems?

patching code in a running process is way complicated and messy, if
you think this is the easier solution i guess i have little more to
say

> For me, is seems very dangerous to estimate the primary copy is not
> broken through status takeover..

that would also be a problem for live patching too, if you have bad
state, you have bad state --- live patching doesn't change that

> Some process use critical resources such as fixed network listen
> port can not speed up so.

hand the fd off to another process

> More importantly, the only process who prepare to use this mechanism
> only allows to use quick process takeover.

no, i can mmap state or similar, hand fd's off and switch to another
process in a context switch... hot patching i bet is going to be
slower

how about you show up some code that needs this?

> This cause software development difficult.

i honestly doubt in most cases you can hand live patch faster and more
easily than having the application sensibly written and passing it off

please, prove me wrong, show us some code

> The live patching does not require to implement such special
> techniques on applications.

this is like saying live patching is a complicated in-kernel solution
for badly written userspace isn't it?

2005-04-19 05:20:31

by Takashi Ikebe

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

Sorry, I may mistake the point,
Chris Wedgwood wrote:

>>For me, is seems very dangerous to estimate the primary copy is not
>>broken through status takeover..
>>
>>
>
>that would also be a problem for live patching too, if you have bad
>state, you have bad state --- live patching doesn't change that
>
>
What I want to say is takeover may makes memory unstable, because there
are extra operations to reserve current (unstable) status to memory.
Live patching never force target process to reserve status to memory. Is
this make sense?

>>Some process use critical resources such as fixed network listen
>>port can not speed up so.
>>
>>
>
>hand the fd off to another process
>
>
I think the point is how long does it takes to hand the fd off to
another process.(means how long time the network port is unavailable??)

>>More importantly, the only process who prepare to use this mechanism
>>only allows to use quick process takeover.
>>
>>
>
>no, i can mmap state or similar, hand fd's off and switch to another
>process in a context switch... hot patching i bet is going to be
>slower
>
>how about you show up some code that needs this?
>
>
>>This cause software development difficult.
>>
>>
>
>i honestly doubt in most cases you can hand live patch faster and more
>easily than having the application sensibly written and passing it off
>
>please, prove me wrong, show us some code
>
>
Please see and try http://pannus.sourceforge.net
There includes commands and some samples.
On live patching, you never need to use shared memory, just prepare
fixed code, and just compile it as shared ibject, that's all. pretty
easy and fast to replace the functions.

--
Takashi Ikebe
NTT Network Service Systems Laboratories
9-11, Midori-Cho 3-Chome Musashino-Shi,
Tokyo 180-8585 Japan
Tel : +81 422 59 4246, Fax : +81 422 60 4012
e-mail : [email protected]


2005-04-19 05:53:13

by Chris Wedgwood

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Tue, Apr 19, 2005 at 02:19:57PM +0900, Takashi Ikebe wrote:

> What I want to say is takeover may makes memory unstable, because
> there are extra operations to reserve current (unstable) status to
> memory.

mmap is coherent between processes

> Live patching never force target process to reserve status to memory. Is
> this make sense?

Not really. I don't see how it makes it any better or easier, just
different.

> I think the point is how long does it takes to hand the fd off to
> another process. (means how long time the network port is
> unavailable??)

Probably under 1 ms. Not long anyhow.

> Please see and try http://pannus.sourceforge.net

> There includes commands and some samples.

pannus-sample.tgz contains some pretty contrived examples, nothing
that anyone could really sensibly comment on

> On live patching, you never need to use shared memory, just prepare
> fixed code, and just compile it as shared ibject, that's all. pretty
> easy and fast to replace the functions.

it requires magic like a compiler and knowledge of the original
application.

if the application was written sensibly someone without access to the
application code could change this live taking over the previous
applications state even more easily --- and the code would be more
straightforward. so i still fail to see why this is needed.

2005-04-19 05:58:13

by Takashi Ikebe

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

Takashi Ikebe wrote:

>Sorry, I may mistake the point,
>Chris Wedgwood wrote:
>
>
>>that would also be a problem for live patching too, if you have bad
>>state, you have bad state --- live patching doesn't change that
>>
>>
>What I want to say is takeover may makes memory unstable, because there
>are extra operations to reserve current (unstable) status to memory.
>Live patching never force target process to reserve status to memory. Is
>this make sense?
>
>
Sorry, I misunderstand it, forget above comment, both methods are
possible to destroy memory.


--
Takashi Ikebe
NTT Network Service Systems Laboratories
9-11, Midori-Cho 3-Chome Musashino-Shi,
Tokyo 180-8585 Japan
Tel : +81 422 59 4246, Fax : +81 422 60 4012
e-mail : [email protected]


2005-04-20 04:18:51

by Takashi Ikebe

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

Hello,
Chris Wedgwood wrote:

>
>
>
>>On live patching, you never need to use shared memory, just prepare
>>fixed code, and just compile it as shared ibject, that's all. pretty
>>easy and fast to replace the functions.
>>
>>
>
>it requires magic like a compiler and knowledge of the original
>application.
>
>
Well, Live patching is just a patch, so I think the developer of patch
should know the original source code well.

>if the application was written sensibly someone without access to the
>application code could change this live taking over the previous
>applications state even more easily --- and the code would be more
>straightforward. so i still fail to see why this is needed.
>
>
Well, as you said some application can do that, but some application can
not continue service with your suggestion.
please think about the process which use connection type communication
such as TCP(it's only example) between users and server. During status
copy, all the session between users and server are disconnected... can
not save the exiting service at all.
It's one example, but similar problems may occurs whenever processed use
the resources which are mainly controlled by kernel.

--
Takashi Ikebe
NTT Network Service Systems Laboratories
9-11, Midori-Cho 3-Chome Musashino-Shi,
Tokyo 180-8585 Japan
Tel : +81 422 59 4246, Fax : +81 422 60 4012
e-mail : [email protected]


2005-04-20 05:44:16

by Chris Wedgwood

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Wed, Apr 20, 2005 at 01:18:23PM +0900, Takashi Ikebe wrote:

> Well, Live patching is just a patch, so I think the developer of
> patch should know the original source code well.

In which case they could fix the application.

> Well, as you said some application can do that, but some application
> can not continue service with your suggestion.

Such as?

> please think about the process which use connection type
> communication such as TCP(it's only example) between users and
> server. During status copy, all the session between users and server
> are disconnected...

They don't have to be.

> can not save the exiting service at all.

Yes they can.

> It's one example, but similar problems may occurs whenever processed
> use the resources which are mainly controlled by kernel.

What resources? We can migrate memory and file descriptors? What is
missing?

Anyhow, you seem hell bent on this despite showing any real evidence
it's useful or desirable... maybe a different audience for your
patches would help?

http://selenic.com/mailman/listinfo/kernel-mentors might be of value
to you.

2005-04-20 07:36:59

by Takashi Ikebe

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

Hi,
I think basic assumption between us and you is not match..
Our assumption, the live patching is not for debug, but for the real
operation method to fix very very important process which can not stop.
Live patchin fix the important process's bug without disrupting process.

Chris Wedgwood wrote:
> On Wed, Apr 20, 2005 at 01:18:23PM +0900, Takashi Ikebe wrote:
>
>
>>Well, Live patching is just a patch, so I think the developer of
>>patch should know the original source code well.
>
> In which case they could fix the application.
>
Yes, so they provide us the patch module, and we want to apply the patch
as live patching.
>
>>Well, as you said some application can do that, but some application
>>can not continue service with your suggestion.
>
> Such as?
>
>>please think about the process which use connection type
>>communication such as TCP(it's only example) between users and
>>server. During status copy, all the session between users and server
>>are disconnected...
>
>
> They don't have to be.

???
To takeover the application status, connection type
communications(SOCK_STREAM) are need to be disconnected by close().
Same network port is not allowed to bind by multiple processes....

How can you do that??
Users don't want to disconnect,(and also we don't want to disconnect)
but server process need to it to takeover the status.

>>can not save the exiting service at all.
>
> Yes they can.
>
>>It's one example, but similar problems may occurs whenever processed
>>use the resources which are mainly controlled by kernel.
>
> What resources? We can migrate memory and file descriptors? What is
> missing?

For example,
current process's resouces of rlimit.
you nerver set current rusage to new process.
especialy, ru_utime and ru_stime is very important to critical applications.
I don't know much about resources, but there may be more....(I hope not..)

--
Takashi Ikebe
NTT Network Service Systems Laboratories
9-11, Midori-Cho 3-Chome Musashino-Shi,
Tokyo 180-8585 Japan
Tel : +81 422 59 4246, Fax : +81 422 60 4012
e-mail : [email protected]

2005-04-20 07:51:15

by Chris Wedgwood

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Wed, Apr 20, 2005 at 04:35:07PM +0900, Takashi Ikebe wrote:

> I think basic assumption between us and you is not match...

No, I think at a high-level they do.

> Our assumption, the live patching is not for debug, but for the real
> operation method to fix very very important process which can not
> stop.

I understand that.

It might be though you could probably do what you want with some kind
of enhanced ptrace or debugging interface that would also be of value
to other people and probably simple than your proposed patch.

> Live patchin fix the important process's bug without disrupting
> process.

I understand that.

> To takeover the application status, connection type
> communications(SOCK_STREAM) are need to be disconnected by close().
> Same network port is not allowed to bind by multiple processes....

AF_UNIX socket with SCM_RIGHTS

> especialy, ru_utime and ru_stime is very important to critical
> applications.

how so? what is magical about these that can't be dealt with in
userspace should it span 2+ processes?

2005-04-20 08:00:00

by Takashi Ikebe

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

Chris Wedgwood wrote:
> On Wed, Apr 20, 2005 at 04:35:07PM +0900, Takashi Ikebe wrote:>
>
>>To takeover the application status, connection type
>>communications(SOCK_STREAM) are need to be disconnected by close().
>>Same network port is not allowed to bind by multiple processes....
>
>
> AF_UNIX socket with SCM_RIGHTS
>
hmm.. most internet base services will use TCPv4 TCPv6 SCTP...
AF_UNIX can not use as inter-nodes communication.


--
Takashi Ikebe
NTT Network Service Systems Laboratories
9-11, Midori-Cho 3-Chome Musashino-Shi,
Tokyo 180-8585 Japan
Tel : +81 422 59 4246, Fax : +81 422 60 4012
e-mail : [email protected]

2005-04-20 08:27:18

by Chris Wedgwood

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Wed, Apr 20, 2005 at 04:57:31PM +0900, Takashi Ikebe wrote:

> hmm.. most internet base services will use TCPv4 TCPv6 SCTP...
> AF_UNIX can not use as inter-nodes communication.

You can send file descriptors (the actually file descriptors
themselves, not their contents) to another process over a socket.

A nearly ten-year old example is attached (ie. this isn't new or
magical or specific to Linux).


Attachments:
(No filename) (407.00 B)
sendfd.c (2.11 kB)
Download all attachments

2005-04-20 08:34:52

by Miquel van Smoorenburg

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

In article <[email protected]>,
Takashi Ikebe <[email protected]> wrote:
>Chris Wedgwood wrote:
>> On Wed, Apr 20, 2005 at 04:35:07PM +0900, Takashi Ikebe wrote:>
>>
>>>To takeover the application status, connection type
>>>communications(SOCK_STREAM) are need to be disconnected by close().
>>>Same network port is not allowed to bind by multiple processes....
>>
>>
>> AF_UNIX socket with SCM_RIGHTS
>>
>hmm.. most internet base services will use TCPv4 TCPv6 SCTP...
>AF_UNIX can not use as inter-nodes communication.

No, Chris means filedescriptor passing.

You can pass any existing open filedescriptor to another process
using an AF_UNIX socket.

For example, the existing running process creates a UNIX socket in
/var/run/mysocket that the new process can connect() to. The
processes can then not only exchange all kinds of information,
the old process can even send open filedescriptors over to
the new process without closing/re-opening.

See "man 7 unix", ANCILLARY MESSAGES -> SCM_RIGHTS

ANCILLARY MESSAGES
Ancillary data is sent and received using sendmsg(2) and recvmsg(2).
For historical reasons the ancillary message types listed below are
specified with a SOL_SOCKET type even though they are PF_UNIX specific.
To send them set the cmsg_level field of the struct cmsghdr to
SOL_SOCKET and the cmsg_type field to the type. For more information
see cmsg(3).


SCM_RIGHTS
Send or receive a set of open file descriptors from another pro-
cess. The data portion contains an integer array of the file
descriptors. The passed file descriptors behave as though they
have been created with dup(2).

Mike.

2005-04-20 08:47:28

by Takashi Ikebe

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

Chris Wedgwood wrote:
> On Wed, Apr 20, 2005 at 04:57:31PM +0900, Takashi Ikebe wrote:
>
>
>>hmm.. most internet base services will use TCPv4 TCPv6 SCTP...
>>AF_UNIX can not use as inter-nodes communication.
>
>
> You can send file descriptors (the actually file descriptors
> themselves, not their contents) to another process over a socket.
>
> A nearly ten-year old example is attached (ie. this isn't new or
> magical or specific to Linux).
>
>
>
> ------------------------------------------------------------------------
> int main()
> {
> int fds[2];
> int fd = -1;
> int rc = socketpair(AF_UNIX, SOCK_STREAM, 0, fds);
Hmm interest enough,
But please see man.

NOTES
On Linux, the only supported domain for this call is AF_UNIX (or
syn-
onymously, AF_LOCAL). (Most implementations have the same
restric-
tion.)

Only for AF_UNIX..


Well, as many said Live patching is very historical & authoritative
function on especially carrier, telecom vendor.
If linux want to be adopted on mission critical world, this function is
esseintial.

--
Takashi Ikebe
NTT Network Service Systems Laboratories
9-11, Midori-Cho 3-Chome Musashino-Shi,
Tokyo 180-8585 Japan
Tel : +81 422 59 4246, Fax : +81 422 60 4012
e-mail : [email protected]

2005-04-20 08:52:12

by Chris Wedgwood

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Wed, Apr 20, 2005 at 05:45:00PM +0900, Takashi Ikebe wrote:

> Only for AF_UNIX..

I'm sure that means AF_UNIX is restricted for the socket you use to
pass the file descriptors, not a restriction on the file descriptors
themselves. I don't see why the kernel would care what the
descriptors are.

> Well, as many said Live patching is very historical & authoritative
> function on especially carrier, telecom vendor.

Linux doesn't have it now, do it's not historical in the Linux space.

> If linux want to be adopted on mission critical world, this function
> is esseintial.

But Linux is used in mission critical places and we don't have that
feature.

2005-04-20 11:20:17

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Wed, 20 Apr 2005, Takashi Ikebe wrote:

> Well, as many said Live patching is very historical & authoritative
> function on especially carrier, telecom vendor.
> If linux want to be adopted on mission critical world, this function is
> esseintial.

Yes, if you want to use Linux in those scenarios you will
need to change the telco programs to use shared memory and
file descriptor passing, instead of live patching.

--
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan

2005-04-20 13:10:54

by Ralf Baechle

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Mon, Apr 18, 2005 at 02:25:06AM -0700, Chris Wedgwood wrote:

> > The call switching folks have been doing live patching at least
> > since I worked on it, over 25 years ago. This is not just
> > marketing.
>
> That still doesn't explain *why* live patching is needed.

The more optimization a modern compiler does the less practical a patching
approach seems for anything but very trivial fixes.

I'd try a shared library based approach for on the fly updates.

Ralf

2005-04-20 15:10:07

by Chris Friesen

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

Rik van Riel wrote:
> On Wed, 20 Apr 2005, Takashi Ikebe wrote:

>>Well, as many said Live patching is very historical & authoritative
>>function on especially carrier, telecom vendor.
>>If linux want to be adopted on mission critical world, this function is
>>esseintial.

> Yes, if you want to use Linux in those scenarios you will
> need to change the telco programs to use shared memory and
> file descriptor passing, instead of live patching.

Unfortunately we're also dealing (in many cases) with pre-existing
software coming over from other OS's. The beancounters want to avoid
rewriting the millions of lines of application code, so they'd rather
add the missing support to the kernel.

If it doesn't go into mainline, we'll just end up with a bunch of
different telco-patches being maintained on the side. I highly doubt
all the applications will get fixed any time soon.

Chris

2005-04-20 15:12:00

by Chris Friesen

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

Ralf Baechle wrote:


> I'd try a shared library based approach for on the fly updates.

The version that I've seen imposed requirements on the application for
this to work properly.

There are tradeoffs either way.

Chris

2005-04-23 16:17:38

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

Takashi Ikebe <[email protected]> writes:

> The patch was over 50k, so I separate it to each architecture and in line..
>
> This patch add function called "Live patching" which is defined on
> OSDL's carrier grade linux requiremnt definition to linux 2.6.11.7 kernel.
> The live patching allows process to patch on-line (without restarting
> process) on i386 and x86_64 architectures, by overwriting jump assembly
> code on entry point of functions which you want to fix, to patched
> functions.

How exactly is this different from ptrace?
Seems just like a ptrace memcpy extension
Is the patching really that time critical that you cant do it
with normal ptrace?


> + if(((current->uid != tsk->euid) ||
> + (current->uid != tsk->suid) ||
> + (current->uid != tsk->uid) ||
> + (current->gid != tsk->egid) ||
> + (current->gid != tsk->sgid) ||
> + (current->gid != tsk->gid)) && !capable(CAP_SYS_PANNUS)) {
> + // invalid user in sys_accesspvm
> + return -EPERM;
> + }
> +> + p = vmalloc(len);

This needs a limit.
annus-x86_64/arch/x86_64/kernel/entry.S
> --- linux-2.6.11.7-vanilla/arch/x86_64/kernel/entry.S 2005-04-08 03:57:30.000000000 +0900
> +++ linux-2.6.11.7-pannus-x86_64/arch/x86_64/kernel/entry.S 2005-04-18 10:45:47.000000000 +0900
> @@ -214,6 +214,8 @@ sysret_check:
> /* Handle reschedules */
> /* edx: work, edi: workmask */
> sysret_careful:
> + cmpl $0,threadinfo_inipending(%rcx)
> + jne sysret_init

Put the check into the normal notify_resume work mask, not adding
a separate check into this critical fast path.

> CFI_ENDPROC
>
> /*
> + * In the case restorer calls rt_handlereturn, collect and store registers,
> + * and call rt_handlereturn with stored register struct.
> + */
> +ENTRY(stub_rt_handlereturn)

This seems quite pointless since ptrace and can change all registers
in a child.

Didnt review more.

-Andi

2005-04-25 02:11:40

by Takashi Ikebe

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

Hello,
Andi Kleen wrote:

>Takashi Ikebe <[email protected]> writes:
>
>
>
>>The patch was over 50k, so I separate it to each architecture and in line..
>>
>>This patch add function called "Live patching" which is defined on
>>OSDL's carrier grade linux requiremnt definition to linux 2.6.11.7 kernel.
>>The live patching allows process to patch on-line (without restarting
>>process) on i386 and x86_64 architectures, by overwriting jump assembly
>>code on entry point of functions which you want to fix, to patched
>>functions.
>>
>>
>
>How exactly is this different from ptrace?
>Seems just like a ptrace memcpy extension
>Is the patching really that time critical that you cant do it
>with normal ptrace?
>
>
Only few patch modules are not so critical, however sometimes large
number of patches are applied at one time. In that case, time is very
critical with normal ptrace. As you know, normal ptrace need to target
process STOP whenever change the memory/registers.
Our approach is "do not stop the target process's execution as possible
as", because the target process can provide service during patch on SMP
machine (do not want to stop service due to patch).
If we load hundreds of patch modules at one time, I think it will goes
quite time critical..

>>+ if(((current->uid != tsk->euid) ||
>>+ (current->uid != tsk->suid) ||
>>+ (current->uid != tsk->uid) ||
>>+ (current->gid != tsk->egid) ||
>>+ (current->gid != tsk->sgid) ||
>>+ (current->gid != tsk->gid)) && !capable(CAP_SYS_PANNUS)) {
>>+ // invalid user in sys_accesspvm
>>+ return -EPERM;
>>+ }
>>+> + p = vmalloc(len);
>>
>>
>
>This needs a limit.
>
>
Thank you, we'll fix this soon.

>annus-x86_64/arch/x86_64/kernel/entry.S
>
>
>>--- linux-2.6.11.7-vanilla/arch/x86_64/kernel/entry.S 2005-04-08 03:57:30.000000000 +0900
>>+++ linux-2.6.11.7-pannus-x86_64/arch/x86_64/kernel/entry.S 2005-04-18 10:45:47.000000000 +0900
>>@@ -214,6 +214,8 @@ sysret_check:
>> /* Handle reschedules */
>> /* edx: work, edi: workmask */
>> sysret_careful:
>>+ cmpl $0,threadinfo_inipending(%rcx)
>>+ jne sysret_init
>>
>>
>
>Put the check into the normal notify_resume work mask, not adding
>a separate check into this critical fast path.
>
>
OK, we'll fix this soon.

>> CFI_ENDPROC
>>
>> /*
>>+ * In the case restorer calls rt_handlereturn, collect and store registers,
>>+ * and call rt_handlereturn with stored register struct.
>>+ */
>>+ENTRY(stub_rt_handlereturn)
>>
>>
>
>This seems quite pointless since ptrace and can change all registers
>in a child.
>
>
Well, this can change as you said, but I think, this makes target
process stopping time increase.
Because, to control target process's (patch module's) initialization,
the command process should know the target process's status and then
stop with ptrace.
Currently rt_handlereturn works on target process's context like signal
handler return, so, I think there is minimum time loss on target process.
If command process controls the target process's initialization, this
seems target process's stopping time increasing.
Well, may be our idea is wrong, please tell us.

Thank you your advice!

>Didnt review more.
>
>-Andi
>
>


--
Takashi Ikebe
NTT Network Service Systems Laboratories
9-11, Midori-Cho 3-Chome Musashino-Shi,
Tokyo 180-8585 Japan
Tel : +81 422 59 4246, Fax : +81 422 60 4012
e-mail : [email protected]


2005-04-25 02:50:22

by Kyle Moffett

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

If you want that exact functionality, do this:

At program start, spawn a new thread:
1) Open a UNIX socket (/var/run/someapp_live_patch.sock)
2) poll() that socket for a connection.
3) When you get a connection, do your own security checks
4) If it's ok, then map the specified file into memory
5) Read a table of crap to patch from the file
6) Do the patching, being careful to avoid the millions of
races involved for each CPU, *especially* regarding the
separate icache and dcache on CPUs like PPC and such.
7) Go back to step 2

If you want equivalent functionality but much safer and not CPU
dependent and full of hand-coded assembly:

1) open(), mmap(), and mlock() the file (/var/lib/someapp/data)
2) Spawn normal operation threads
3) Spawn a new hot-patch thread:
1) Open a UNIX socket (/var/run/someapp_live_patch.sock)
2) poll() that socket for a connection.
3) When you get one, coordinate with the new process as it
attaches itself to /var/lib/someapp/data
4) Handle shared locking of parts of /var/lib/someapp/data
5) Send it your listen() file-descriptors over the socket.
6) Wait for the other process to signal it's ready.
7) Stop accepting new connections on the socket.
8) Send file-descriptors for current connections
9) Cleanup and quit

When live-patching:
1) connect to the socket /var/run/someapp_live_patch.sock
2) open(), mmap() and mlock() /var/lib/someapp/data
3) Coordinate with the other process via the socket
4) Receive the listen() file-descriptors over the socket.
5) Set up the shared data locking
6) Spawn normal operation threads
7) Signal readiness
8) Receive file-descriptors for current connections
9) Spawn threads for them too.
10) Spawn a new hot-patch thread as above

Cheers,
Kyle Moffett

-----BEGIN GEEK CODE BLOCK-----
Version: 3.12
GCM/CS/IT/U d- s++: a18 C++++>$ UB/L/X/*++++(+)>$ P+++(++++)>$
L++++(+++) E W++(+) N+++(++) o? K? w--- O? M++ V? PS+() PE+(-) Y+
PGP+++ t+(+++) 5 X R? tv-(--) b++++(++) DI+ D+ G e->++++$ h!*()>++$ r
!y?(-)
------END GEEK CODE BLOCK------


2005-04-25 10:40:23

by Takashi Ikebe

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

Kyle, thank you so much for your detailed information.
If you design completely new software, your suggestion is very useful!

Unfortunately, we carrier have very many exiting software and try to run
on Linux.
We need to seek the way which can apply to exiting software also...

Kyle Moffett wrote:

> If you want that exact functionality, do this:
>
> At program start, spawn a new thread:
> 1) Open a UNIX socket (/var/run/someapp_live_patch.sock)
> 2) poll() that socket for a connection.
> 3) When you get a connection, do your own security checks
> 4) If it's ok, then map the specified file into memory
> 5) Read a table of crap to patch from the file
> 6) Do the patching, being careful to avoid the millions of
> races involved for each CPU, *especially* regarding the
> separate icache and dcache on CPUs like PPC and such.
> 7) Go back to step 2
>
> If you want equivalent functionality but much safer and not CPU
> dependent and full of hand-coded assembly:
>
> 1) open(), mmap(), and mlock() the file (/var/lib/someapp/data)
> 2) Spawn normal operation threads
> 3) Spawn a new hot-patch thread:
> 1) Open a UNIX socket (/var/run/someapp_live_patch.sock)
> 2) poll() that socket for a connection.
> 3) When you get one, coordinate with the new process as it
> attaches itself to /var/lib/someapp/data
> 4) Handle shared locking of parts of /var/lib/someapp/data
> 5) Send it your listen() file-descriptors over the socket.
> 6) Wait for the other process to signal it's ready.
> 7) Stop accepting new connections on the socket.
> 8) Send file-descriptors for current connections
> 9) Cleanup and quit
>
> When live-patching:
> 1) connect to the socket /var/run/someapp_live_patch.sock
> 2) open(), mmap() and mlock() /var/lib/someapp/data
> 3) Coordinate with the other process via the socket
> 4) Receive the listen() file-descriptors over the socket.
> 5) Set up the shared data locking
> 6) Spawn normal operation threads
> 7) Signal readiness
> 8) Receive file-descriptors for current connections
> 9) Spawn threads for them too.
> 10) Spawn a new hot-patch thread as above
>
> Cheers,
> Kyle Moffett
>
> -----BEGIN GEEK CODE BLOCK-----
> Version: 3.12
> GCM/CS/IT/U d- s++: a18 C++++>$ UB/L/X/*++++(+)>$ P+++(++++)>$
> L++++(+++) E W++(+) N+++(++) o? K? w--- O? M++ V? PS+() PE+(-) Y+
> PGP+++ t+(+++) 5 X R? tv-(--) b++++(++) DI+ D+ G e->++++$ h!*()>++$ r
> !y?(-)
> ------END GEEK CODE BLOCK------
>
>
> -
> To unsubscribe from this list: send the line "unsubscribe
> linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/



--
Takashi Ikebe
NTT Network Service Systems Laboratories
9-11, Midori-Cho 3-Chome Musashino-Shi,
Tokyo 180-8585 Japan
Tel : +81 422 59 4246, Fax : +81 422 60 4012
e-mail : [email protected]


2005-04-25 11:15:46

by Kyle Moffett

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7


On Apr 25, 2005, at 06:39, Takashi Ikebe wrote:
> Kyle Moffett wrote:
>
>> If you want that exact functionality, do this:
>>
>> At program start, spawn a new thread:
>> 1) Open a UNIX socket (/var/run/someapp_live_patch.sock)
>> 2) poll() that socket for a connection.
>> 3) When you get a connection, do your own security checks
>> 4) If it's ok, then map the specified file into memory
>> 5) Read a table of crap to patch from the file
>> 6) Do the patching, being careful to avoid the millions of
>> races involved for each CPU, *especially* regarding the
>> separate icache and dcache on CPUs like PPC and such.
>> 7) Go back to step 2
> Kyle, thank you so much for your detailed information.
> If you design completely new software, your suggestion is very useful!
>
> Unfortunately, we carrier have very many exiting software and try to
> run
> on Linux.
> We need to seek the way which can apply to exiting software also...

If you notice, the above method has only minimal changes from
your mmap3 stuff, except without needing kernel support. One
thing to remember, though, as there _is_ a very clean method
to do this from userspace, therefore you are not likely to
get much sympathy on this list.

I suggest you try adding a new hotpatch thread to your code,
as above, then use it to implement the mmap3 and other tasks
necessary for live patching instead of in kernel space.

Cheers,
Kyle Moffett

-----BEGIN GEEK CODE BLOCK-----
Version: 3.12
GCM/CS/IT/U d- s++: a18 C++++>$ UB/L/X/*++++(+)>$ P+++(++++)>$
L++++(+++) E W++(+) N+++(++) o? K? w--- O? M++ V? PS+() PE+(-) Y+
PGP+++ t+(+++) 5 X R? tv-(--) b++++(++) DI+ D+ G e->++++$ h!*()>++$ r
!y?(-)
------END GEEK CODE BLOCK------


2005-04-25 15:52:51

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

Hi!

> Kyle, thank you so much for your detailed information.
> If you design completely new software, your suggestion is very useful!
>
> Unfortunately, we carrier have very many exiting software and try to run
> on Linux.
> We need to seek the way which can apply to exiting software also...

"We want to do the wrong thing because we think its easier".

Okay, you are free to do that, but don't try to push that into
mainline kernel. Maintain your own patches; if that seems too hard, do
the right thing.
Pavel

--
Boycott Kodak -- for their patent abuse against Java.

2005-04-25 15:56:12

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Mon, Apr 25, 2005 at 07:39:51PM +0900, Takashi Ikebe wrote:
> Kyle, thank you so much for your detailed information.
> If you design completely new software, your suggestion is very useful!
>
> Unfortunately, we carrier have very many exiting software and try to run
> on Linux.
> We need to seek the way which can apply to exiting software also...

ptrace can all do this, even with an existing kernel.
Your full patch is just a funky ptrace equivalent as far as I can see.


-Andi

2005-04-25 16:40:16

by Valdis Klētnieks

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Mon, 25 Apr 2005 19:39:51 +0900, Takashi Ikebe said:

> Unfortunately, we carrier have very many exiting software and try to run
> on Linux.
> We need to seek the way which can apply to exiting software also...

You *really* want to take the time to re-write the software to do things
The Linux Way. If you're looking at doing on-the-fly patching, you're
probably also carrying around a lot of *other* ugly cruft to make this
creeping horror work on Linux. In fact, I'd not be surprised if you have
a shim layer to make the compatibility layer for the *previous* system
work on Linux...

I'm reminded of a (possibly apocryphal) quote from an ATT spokesperson from
1988 or so, when a misplaced comma in a patch kept crashing the long-distance
phone network. When asked "Why don't you just reboot the affected switches?"
his response was "This assumes that the switch had ever been booted in the
first place". (Apparently, the *whole thing* had been on-the-fly replaced/patched
without an actual reload happening...)

Gaaahhh! :)


Attachments:
(No filename) (226.00 B)

2005-04-26 01:48:17

by Takashi Ikebe

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

I think that's the common sense in every carrier.
If we reboot the switch, the service will be disrupted.
The phone network is lifeline, and does not allow to be disrupt by just
bug fix.
I think same kind of function is needed in many real
enterprise/mission-critical/business area.

All do with ptrace may affect target process's time critical task. (need
to stop target process whenever fix)
All implement in user application costs too much, need to implement all
the application...(and I do not know this approach really works on time
critical applications yet.)
There are clear demand to realize this common and GPL-ed function....

[email protected] wrote:
> On Mon, 25 Apr 2005 19:39:51 +0900, Takashi Ikebe said:
>
>
>>Unfortunately, we carrier have very many exiting software and try to run
>>on Linux.
>>We need to seek the way which can apply to exiting software also...
>
>
> You *really* want to take the time to re-write the software to do things
> The Linux Way. If you're looking at doing on-the-fly patching, you're
> probably also carrying around a lot of *other* ugly cruft to make this
> creeping horror work on Linux. In fact, I'd not be surprised if you have
> a shim layer to make the compatibility layer for the *previous* system
> work on Linux...
>
> I'm reminded of a (possibly apocryphal) quote from an ATT spokesperson from
> 1988 or so, when a misplaced comma in a patch kept crashing the long-distance
> phone network. When asked "Why don't you just reboot the affected switches?"
> his response was "This assumes that the switch had ever been booted in the
> first place". (Apparently, the *whole thing* had been on-the-fly replaced/patched
> without an actual reload happening...)
>
> Gaaahhh! :)
>

2005-04-26 02:15:54

by Kyle Moffett

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Apr 25, 2005, at 21:34, Takashi Ikebe wrote:
> [email protected] wrote:
>> When asked "Why don't you just reboot the affected switches?" his
>> response was "This assumes that the switch had ever been booted in
>> the first place". (Apparently, the *whole thing* had been
>> on-the-fly replaced/patched without an actual reload happening...)
>> Gaaahhh! :)
>
> I think that's the common sense in every carrier.

That is definitely not common sense. It may be good business
practice, but those are two *entirely* different things.

> If we reboot the switch, the service will be disrupted.

Yes. My personal favorite solution to this problem is HeartBeat,
some Open-Source software that is very good at maintaining high
availability. With a properly written multi-system clustering
switch application that utilizes the Linux Virtual-Server tools,
you could reasonably efficiently run a system such that you can
reboot any individual system without any loss of service.

> The phone network is lifeline, and does not allow to be disrupt
> by just bug fix. I think same kind of function is needed in many
> real enterprise/mission-critical/business area.

But you miss the point. Linux is *NOT* about "business", or
"enterprise", or "mission-critical". Linux is (at least to
many hackers) about hacking, having fun, and Good Design(TM).

> All do with ptrace may affect target process's time critical
> task. (need to stop target process whenever fix)

So don't do it with ptrace!!! I've given you one other method
that uses minimal changes to existing software and emulates the
crappy mmap3 call you keep trying to push.

> All implement in user application costs too much,

What about one of the dozen other offered methods?

> need to implement all the application...

So why not write a utility library? You'd need to "implement
all in the kernel", too, and since it can be done better in
userspace, let's keep out the bloat while we're at it.

> (and I do not know this approach really works on time critical
> applications yet.)

So test it! You're clearly working for a big corporation with
the money and resources to develop something like this, so do
so, and if you get something that works well, *and* uses good
design, we'll welcome patches!

> There are clear demand to realize this common and GPL-ed
> function....

The kernel is not about business, demand, or what the CEO of
some big-name company wants. The kernel strives for the goal
of "Good Engineering (TM)".


Cheers,
Kyle Moffett

-----BEGIN GEEK CODE BLOCK-----
Version: 3.12
GCM/CS/IT/U d- s++: a18 C++++>$ UB/L/X/*++++(+)>$ P+++(++++)>$
L++++(+++) E W++(+) N+++(++) o? K? w--- O? M++ V? PS+() PE+(-) Y+
PGP+++ t+(+++) 5 X R? tv-(--) b++++(++) DI+ D+ G e->++++$ h!*()>++$ r
!y?(-)
------END GEEK CODE BLOCK------


2005-04-26 09:36:59

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On ?t 26-04-05 10:34:56, Takashi Ikebe wrote:
> I think that's the common sense in every carrier.
> If we reboot the switch, the service will be disrupted.
> The phone network is lifeline, and does not allow to be disrupt by just
> bug fix.
> I think same kind of function is needed in many real
> enterprise/mission-critical/business area.
>
> All do with ptrace may affect target process's time critical task. (need
> to stop target process whenever fix)
> All implement in user application costs too much, need to implement all
> the application...(and I do not know this approach really works on time
> critical applications yet.)
> There are clear demand to realize this common and GPL-ed function....
~~~~~~~~~~~~~~~~
I had very strong urge to reply with "<plonk>" here.

Clearly noone but you wants to make kernel more ugly just for "faster
ptrace". If you want faster ptrace, fine, advertise it as such and
provide nice and small patch to make it faster.

If you are going to handwave about "clear demand", well, find some
other list to troll on.
Pavel
--
Boycott Kodak -- for their patent abuse against Java.

2005-04-26 13:05:57

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH x86_64] Live Patching Function on 2.6.11.7

On Tue, Apr 26, 2005 at 10:34:56AM +0900, Takashi Ikebe wrote:
> I think that's the common sense in every carrier.
> If we reboot the switch, the service will be disrupted.
> The phone network is lifeline, and does not allow to be disrupt by just
> bug fix.
> I think same kind of function is needed in many real
> enterprise/mission-critical/business area.
>
> All do with ptrace may affect target process's time critical task. (need
> to stop target process whenever fix)

Sorry, but what are your exact time requirements for this?

Remember any x86-64 CPU is really fast and it can do a _lot_ of ptrace
operations in a very short time.

Just a vague "it may be too slow" is not enough justification to
push a lot of redundant code into the kernel. Also if ptrace
should be really too slow (which I doubt, but you are welcome
to show some numbers together with real time requirements from
a real system) then we could optimize ptrace for this, e.g.
by adding a ptrace subcommand to copy whole memory blocks
more efficiently or maybe even do a mmap like thing.

But unless someone actually demonstrates this is needed it seems far overkill.

> All implement in user application costs too much, need to implement all
> the application...(and I do not know this approach really works on time
> critical applications yet.)

I think you have a lot of unproved and doubtful assumptions here.

-Andi