This patch adds the new PTRACE_VM_SKIPCALL and PTRACE_VM_SKIPEXIT
tags for ptrace's addr parameter.
In this way it is possible to (eventually) get rid of PTRACE_SYSEMU and
PTRACE_SYSEMU_SINGLESTEP, while providing not only the same features
but also more general support for virtual machines.
Part#2: user-mode Linux support.
With this patch, User-mode Linux uses the PTRACE_VM support of the host
operating system and provides PTRACE_VM to its own processes.
At startup UML tests which features are provided and uses PTRACE_VM,
PTRACE_SYSEMU, or neither. PTRACE_VM and/or PTRACE_SYSEMU support can be
disabled by command line flags.
renzo
Signed-off-by: Renzo Davoli <[email protected]>
---
diff -Naur linux-2.6.29-rc7-vm1/arch/um/include/shared/kern_util.h linux-2.6.29-rc7-vm2/arch/um/include/shared/kern_util.h
--- linux-2.6.29-rc7-vm1/arch/um/include/shared/kern_util.h 2009-03-06 20:32:34.000000000 +0100
+++ linux-2.6.29-rc7-vm2/arch/um/include/shared/kern_util.h 2009-03-06 20:33:49.000000000 +0100
@@ -57,7 +57,7 @@
extern unsigned long to_irq_stack(unsigned long *mask_out);
extern unsigned long from_irq_stack(int nested);
-extern void syscall_trace(struct uml_pt_regs *regs, int entryexit);
+extern int syscall_trace(struct uml_pt_regs *regs, int entryexit);
extern int singlestepping(void *t);
extern void segv_handler(int sig, struct uml_pt_regs *regs);
diff -Naur linux-2.6.29-rc7-vm1/arch/um/include/shared/ptrace_user.h linux-2.6.29-rc7-vm2/arch/um/include/shared/ptrace_user.h
--- linux-2.6.29-rc7-vm1/arch/um/include/shared/ptrace_user.h 2009-03-06 20:32:34.000000000 +0100
+++ linux-2.6.29-rc7-vm2/arch/um/include/shared/ptrace_user.h 2009-03-06 20:33:49.000000000 +0100
@@ -40,9 +40,20 @@
#define PTRACE_OLDSETOPTIONS PTRACE_SETOPTIONS
#endif
+/* These constants should eventually be added to sys/ptrace.h */
+#ifndef PTRACE_SYSCALL_SKIPCALL
+#define PTRACE_SYSCALL_SKIPCALL 0x6
+#endif
+#ifndef PTRACE_SYSCALL_SKIPEXIT
+#define PTRACE_SYSCALL_SKIPEXIT 0x2
+#endif
+
void set_using_sysemu(int value);
int get_using_sysemu(void);
extern int sysemu_supported;
+void set_using_sysptvm(int value);
+int get_using_sysptvm(void);
+extern int sysptvm_supported;
#define SELECT_PTRACE_OPERATION(sysemu_mode, singlestep_mode) \
(((int[3][3] ) { \
diff -Naur linux-2.6.29-rc7-vm1/arch/um/kernel/process.c linux-2.6.29-rc7-vm2/arch/um/kernel/process.c
--- linux-2.6.29-rc7-vm1/arch/um/kernel/process.c 2009-03-06 20:32:34.000000000 +0100
+++ linux-2.6.29-rc7-vm2/arch/um/kernel/process.c 2009-03-06 20:33:49.000000000 +0100
@@ -322,7 +322,9 @@
}
static atomic_t using_sysemu = ATOMIC_INIT(0);
+static atomic_t using_sysptvm = ATOMIC_INIT(0);
int sysemu_supported;
+int sysptvm_supported;
void set_using_sysemu(int value)
{
@@ -336,6 +338,16 @@
return atomic_read(&using_sysemu);
}
+void set_using_sysptvm(int value)
+{
+ atomic_set(&using_sysptvm, value);
+}
+
+int get_using_sysptvm(void)
+{
+ return atomic_read(&using_sysptvm);
+}
+
static int proc_read_sysemu(char *buf, char **start, off_t offset, int size,int *eof, void *data)
{
if (snprintf(buf, size, "%d\n", get_using_sysemu()) < size)
@@ -358,27 +370,63 @@
return count;
}
-int __init make_proc_sysemu(void)
+
+static int proc_read_sysptvm(char *buf, char **start, off_t offset, int size,int *eof, void *data)
{
- struct proc_dir_entry *ent;
- if (!sysemu_supported)
- return 0;
+ int sysptvm=(get_using_sysptvm() != 0);
+ if (snprintf(buf, size, "%d\n", sysptvm) < size)
+ /* No overflow */
+ *eof = 1;
- ent = create_proc_entry("sysemu", 0600, NULL);
+ return strlen(buf);
+}
- if (ent == NULL)
- {
- printk(KERN_WARNING "Failed to register /proc/sysemu\n");
- return 0;
- }
+static int proc_write_sysptvm(struct file *file,const char __user *buf, unsigned long count,void *data)
+{
+ char tmp[2];
+
+ if (copy_from_user(tmp, buf, 1))
+ return -EFAULT;
+
+ if (tmp[0] == '0')
+ set_using_sysptvm(0);
+ if (tmp[0] == '1')
+ set_using_sysemu(/* XXX */ 6);
+ /* We use the first char, but pretend to write everything */
+ return count;
+}
- ent->read_proc = proc_read_sysemu;
- ent->write_proc = proc_write_sysemu;
+int __init make_proc_sysemu_or_sysptvm(void)
+{
+ struct proc_dir_entry *ent;
+ if (sysptvm_supported) {
+ ent = create_proc_entry("sysptvm", 0600, NULL);
+
+ if (ent == NULL)
+ {
+ printk(KERN_WARNING "Failed to register /proc/sysptvm\n");
+ return 0;
+ }
+
+ ent->read_proc = proc_read_sysptvm;
+ ent->write_proc = proc_write_sysptvm;
+ } else if (sysemu_supported) {
+ ent = create_proc_entry("sysemu", 0600, NULL);
+
+ if (ent == NULL)
+ {
+ printk(KERN_WARNING "Failed to register /proc/sysemu\n");
+ return 0;
+ }
+
+ ent->read_proc = proc_read_sysemu;
+ ent->write_proc = proc_write_sysemu;
+ }
return 0;
}
-late_initcall(make_proc_sysemu);
+late_initcall(make_proc_sysemu_or_sysptvm);
int singlestepping(void * t)
{
diff -Naur linux-2.6.29-rc7-vm1/arch/um/kernel/ptrace.c linux-2.6.29-rc7-vm2/arch/um/kernel/ptrace.c
--- linux-2.6.29-rc7-vm1/arch/um/kernel/ptrace.c 2009-03-06 20:32:34.000000000 +0100
+++ linux-2.6.29-rc7-vm2/arch/um/kernel/ptrace.c 2009-03-06 20:33:49.000000000 +0100
@@ -81,6 +86,8 @@
if (request == PTRACE_SYSCALL)
set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
else clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+ child->ptrace &= ~PT_SYSCALL_MASK;
+ child->ptrace |= (addr & PTRACE_SYSCALL_MASK) << 28;
child->exit_code = data;
wake_up_process(child);
ret = 0;
@@ -107,7 +114,9 @@
ret = -EIO;
if (!valid_signal(data))
break;
+ child->ptrace &= ~PT_SYSCALL_MASK;
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+ child->ptrace |= (addr & PTRACE_SYSCALL_MASK) << 28;
set_singlestepping(child, 1);
child->exit_code = data;
/* give it a chance to run. */
@@ -250,7 +259,7 @@
* XXX Check PT_DTRACE vs TIF_SINGLESTEP for singlestepping check and
* PT_PTRACED vs TIF_SYSCALL_TRACE for syscall tracing check
*/
-void syscall_trace(struct uml_pt_regs *regs, int entryexit)
+int syscall_trace(struct uml_pt_regs *regs, int entryexit)
{
int is_singlestep = (current->ptrace & PT_DTRACE) && entryexit;
int tracesysgood;
@@ -272,10 +281,13 @@
send_sigtrap(current, regs, 0);
if (!test_thread_flag(TIF_SYSCALL_TRACE))
- return;
+ return 0;
if (!(current->ptrace & PT_PTRACED))
- return;
+ return 0;
+
+ if (entryexit && (current->ptrace & PT_SYSCALL_SKIPEXIT))
+ return 0;
/*
* the 0x80 provides a way for the tracing parent to distinguish
@@ -296,4 +308,8 @@
send_sig(current->exit_code, current, 1);
current->exit_code = 0;
}
+ if (!entryexit && (current->ptrace & PT_SYSCALL_SKIPCALL))
+ return 1;
+ else
+ return 0;
}
diff -Naur linux-2.6.29-rc7-vm1/arch/um/kernel/skas/syscall.c linux-2.6.29-rc7-vm2/arch/um/kernel/skas/syscall.c
--- linux-2.6.29-rc7-vm1/arch/um/kernel/skas/syscall.c 2009-03-06 20:32:34.000000000 +0100
+++ linux-2.6.29-rc7-vm2/arch/um/kernel/skas/syscall.c 2009-03-06 20:33:49.000000000 +0100
@@ -17,8 +17,9 @@
struct pt_regs *regs = container_of(r, struct pt_regs, regs);
long result;
int syscall;
+ int skip_call;
- syscall_trace(r, 0);
+ skip_call=syscall_trace(r, 0);
/*
* This should go in the declaration of syscall, but when I do that,
@@ -29,12 +30,14 @@
* gcc version 4.0.1 20050727 (Red Hat 4.0.1-5)
* in case it's a compiler bug.
*/
- syscall = UPT_SYSCALL_NR(r);
- if ((syscall >= NR_syscalls) || (syscall < 0))
- result = -ENOSYS;
- else result = EXECUTE_SYSCALL(syscall, regs);
+ if (skip_call == 0) {
+ syscall = UPT_SYSCALL_NR(r);
+ if ((syscall >= NR_syscalls) || (syscall < 0))
+ result = -ENOSYS;
+ else result = EXECUTE_SYSCALL(syscall, regs);
- REGS_SET_SYSCALL_RETURN(r->gp, result);
+ REGS_SET_SYSCALL_RETURN(r->gp, result);
+ }
syscall_trace(r, 1);
}
diff -Naur linux-2.6.29-rc7-vm1/arch/um/os-Linux/skas/process.c linux-2.6.29-rc7-vm2/arch/um/os-Linux/skas/process.c
--- linux-2.6.29-rc7-vm1/arch/um/os-Linux/skas/process.c 2009-03-06 20:32:34.000000000 +0100
+++ linux-2.6.29-rc7-vm2/arch/um/os-Linux/skas/process.c 2009-03-06 20:33:49.000000000 +0100
@@ -157,7 +157,7 @@
* (in local_using_sysemu
*/
static void handle_trap(int pid, struct uml_pt_regs *regs,
- int local_using_sysemu)
+ int local_using_sysptvm_or_sysemu)
{
int err, status;
@@ -167,7 +167,7 @@
/* Mark this as a syscall */
UPT_SYSCALL_NR(regs) = PT_SYSCALL_NR(regs->gp);
- if (!local_using_sysemu)
+ if (!local_using_sysptvm_or_sysemu)
{
err = ptrace(PTRACE_POKEUSR, pid, PT_SYSCALL_NR_OFFSET,
__NR_getpid);
@@ -354,6 +354,7 @@
int err, status, op, pid = userspace_pid[0];
/* To prevent races if using_sysemu changes under us.*/
int local_using_sysemu;
+ int local_using_sysptvm;
if (getitimer(ITIMER_VIRTUAL, &timer))
printk(UM_KERN_ERR "Failed to get itimer, errno = %d\n", errno);
@@ -375,11 +376,12 @@
/* Now we set local_using_sysemu to be used for one loop */
local_using_sysemu = get_using_sysemu();
+ local_using_sysptvm = get_using_sysptvm();
op = SELECT_PTRACE_OPERATION(local_using_sysemu,
singlestepping(NULL));
- if (ptrace(op, pid, 0, 0)) {
+ if (ptrace(op, pid, local_using_sysptvm, 0)) {
printk(UM_KERN_ERR "userspace - ptrace continue "
"failed, op = %d, errno = %d\n", op, errno);
fatal_sigsegv();
diff -Naur linux-2.6.29-rc7-vm1/arch/um/os-Linux/start_up.c linux-2.6.29-rc7-vm2/arch/um/os-Linux/start_up.c
--- linux-2.6.29-rc7-vm1/arch/um/os-Linux/start_up.c 2009-03-06 20:32:34.000000000 +0100
+++ linux-2.6.29-rc7-vm2/arch/um/os-Linux/start_up.c 2009-03-06 20:33:49.000000000 +0100
@@ -198,6 +198,35 @@
" See http://perso.wanadoo.fr/laurent.vivier/UML/ for further \n"
" information.\n\n");
+/* Changed only during early boot */
+static int force_sysptvm_disabled = 0;
+
+static int __init nosysptvm_cmd_param(char *str, int* add)
+{
+ force_sysptvm_disabled = 1;
+ return 0;
+}
+
+__uml_setup("nosysptvm", nosysptvm_cmd_param,
+ "nosysptvm\n"
+ " Turns off syscall emulation tags for ptrace (ptrace_vm) on.\n"
+ " Ptrace_vm is a feature introduced by Renzo Davoli. It changes\n"
+ " behaviour of ptrace() and helps reducing host context switch rate.\n"
+ "\n");
+
+static int use_sysemu = 0;
+
+static int __init usesysemu_cmd_param(char *str, int* add)
+{
+ use_sysemu = 1;
+ return 0;
+}
+
+__uml_setup("usesysemu", usesysemu_cmd_param,
+ "usesysemu\n"
+ " Use sysemu instead of sysptvm even when the kernel supports it.\n\n"
+ );
+
static void __init check_sysemu(void)
{
unsigned long regs[MAX_REG_NR];
@@ -293,6 +322,102 @@
non_fatal("missing\n");
}
+/* test thread code. This thread is started only to test
+ * which features are provided by the linux kernel */
+static int sysptvm_child(void *arg)
+{
+ int *featurep=arg;
+ int p[2]={-1,-1};
+ pid_t pid=os_getpid();
+ if(ptrace(PTRACE_TRACEME, 0, 0, 0) < 0){
+ perror("ptrace test_ptracemulti");
+ kill(pid, SIGKILL);
+ }
+ kill(pid, SIGSTOP);
+ *featurep=0;
+ os_getpid();
+ /* if it reaches this point in 1 stop it means that
+ * PTRACE_SYSCALL_SKIPEXIT works */
+ *featurep=PTRACE_SYSCALL_SKIPEXIT;
+ pipe(p);
+ /* if after a PTRACE_SYSCALL_SKIPCALL p[0] is already <0
+ * pipe has been really skipped */
+ if (p[0] < 0)
+ *featurep=PTRACE_SYSCALL_SKIPCALL;
+ else { /* clean up everything */
+ close(p[0]);
+ close(p[1]);
+ }
+ return 0;
+}
+
+/* kernel feature test:
+ * it returns:
+ * -1 error
+ * 0 old PTRACE_SYSCALL (addr is ignored)
+ * PTRACE_SYSCALL_SKIPEXIT: just skip_exit is provided
+ * PTRACE_SYSCALL_SKIPCALL: the entire syntax is implemented
+ * by the running kernel */
+static int __init test_ptrace_sysptvm(void) {
+ int pid, status, rv, feature;
+ static char stack[1024];
+ feature=0;
+
+ if((pid = clone(sysptvm_child, &stack[1020], SIGCHLD | CLONE_VM, &feature)) < 0)
+ return 0;
+ if(waitpid(pid, &status, WUNTRACED) < 0){
+ kill(pid, SIGKILL);
+ return 0;
+ }
+ /* restart and wait for the next syscall (getpid)*/
+ rv=ptrace(PTRACE_SYSCALL, pid, 0, 0);
+ if(waitpid(pid, &status, WUNTRACED) < 0)
+ goto out;
+ /* try to skip the exit call */
+ rv=ptrace(PTRACE_SYSCALL, pid, PTRACE_SYSCALL_SKIPEXIT, 0);
+ if (rv < 0)
+ goto out;
+ /* wait for the next stop */
+ if(waitpid(pid, &status, WUNTRACED) < 0)
+ goto out;
+ /* if feature is already 0 it means that this is the exit call,
+ * and it has not been skipped, otherwise this is the
+ * entry call for the system call "time" */
+ if (feature<PTRACE_SYSCALL_SKIPEXIT)
+ goto out;
+ /* restart (time) and try to skip the entire call */
+ rv=ptrace(PTRACE_SYSCALL, pid, PTRACE_SYSCALL_SKIPCALL, 0);
+ if(waitpid(pid, &status, WUNTRACED) < 0)
+ return 0;
+out:
+ ptrace(PTRACE_KILL,pid,0,0);
+ /* eliminate zombie */
+ if(waitpid(pid, &status, WUNTRACED) < 0)
+ return 0;
+ return feature;
+}
+
+static int __init check_sysptvm(void)
+{
+ int feature=test_ptrace_sysptvm();
+
+ non_fatal("Checking ptrace new tags for syscall emulation...");
+ if (feature==PTRACE_SYSCALL_SKIPCALL) {
+ sysptvm_supported=1;
+ non_fatal("OK");
+ if (!force_sysptvm_disabled) {
+ set_using_sysptvm(PTRACE_SYSCALL_SKIPCALL);
+ non_fatal("\n");
+ return 1;
+ } else {
+ non_fatal(" (disabled)\n");
+ return 0;
+ }
+ } else
+ non_fatal("unsupported\n");
+ return 0;
+}
+
static void __init check_ptrace(void)
{
int pid, syscall, n, status;
@@ -330,7 +455,8 @@
}
stop_ptraced_child(pid, 0, 1);
non_fatal("OK\n");
- check_sysemu();
+ if (use_sysemu || !check_sysptvm())
+ check_sysemu();
}
extern void check_tmpexec(void);
* Renzo Davoli <[email protected]> wrote:
> +/* test thread code. This thread is started only to test
> + * which features are provided by the linux kernel */
> +static int sysptvm_child(void *arg)
> +{
> + int *featurep=arg;
> + int p[2]={-1,-1};
> + pid_t pid=os_getpid();
> + if(ptrace(PTRACE_TRACEME, 0, 0, 0) < 0){
> + perror("ptrace test_ptracemulti");
> + kill(pid, SIGKILL);
> + }
> + kill(pid, SIGSTOP);
> + *featurep=0;
> + os_getpid();
> + /* if it reaches this point in 1 stop it means that
> + * PTRACE_SYSCALL_SKIPEXIT works */
> + *featurep=PTRACE_SYSCALL_SKIPEXIT;
> + pipe(p);
> + /* if after a PTRACE_SYSCALL_SKIPCALL p[0] is already <0
> + * pipe has been really skipped */
> + if (p[0] < 0)
> + *featurep=PTRACE_SYSCALL_SKIPCALL;
> + else { /* clean up everything */
> + close(p[0]);
> + close(p[1]);
> + }
> + return 0;
Please check Documentation/CodingStyle. Every second line above
violates it. scripts/checkpatch.pl can help out with the more
obvious ones.
Ingo
> Please check Documentation/CodingStyle. Every second line above
> violates it. scripts/checkpatch.pl can help out with the more
> obvious ones.
Ingo,
Thank you for your comment.
You are right, I beg your pardon.
I have updated the patch; it should now be (more) consistent
with the Coding Style specifications.
This patch adds the new PTRACE_VM_SKIPCALL and PTRACE_VM_SKIPEXIT
tags for ptrace's addr parameter.
In this way it is possible to (eventually) get rid of PTRACE_SYSEMU and
PTRACE_SYSEMU_SINGLESTEP, while providing not only the same features
but also more general support for virtual machines.
Part#2: user-mode Linux support.
With this patch, User-mode Linux uses the PTRACE_VM support of the host
operating system and provides PTRACE_VM to its own processes.
At startup UML tests which features are provided and uses PTRACE_VM,
PTRACE_SYSEMU, or neither. PTRACE_VM and/or PTRACE_SYSEMU support can be
disabled by command line flags.
renzo
Signed-off-by: Renzo Davoli <[email protected]>
---
diff -Naur linux-2.6.29-rc7-git4/arch/um/include/shared/kern_util.h linux-2.6.29-rc7-git4.vm2/arch/um/include/shared/kern_util.h
--- linux-2.6.29-rc7-git4/arch/um/include/shared/kern_util.h 2009-03-11 09:25:27.000000000 +0100
+++ linux-2.6.29-rc7-git4.vm2/arch/um/include/shared/kern_util.h 2009-03-11 09:35:23.000000000 +0100
@@ -57,7 +57,7 @@
extern unsigned long to_irq_stack(unsigned long *mask_out);
extern unsigned long from_irq_stack(int nested);
-extern void syscall_trace(struct uml_pt_regs *regs, int entryexit);
+extern int syscall_trace(struct uml_pt_regs *regs, int entryexit);
extern int singlestepping(void *t);
extern void segv_handler(int sig, struct uml_pt_regs *regs);
diff -Naur linux-2.6.29-rc7-git4/arch/um/include/shared/ptrace_user.h linux-2.6.29-rc7-git4.vm2/arch/um/include/shared/ptrace_user.h
--- linux-2.6.29-rc7-git4/arch/um/include/shared/ptrace_user.h 2009-03-11 09:25:27.000000000 +0100
+++ linux-2.6.29-rc7-git4.vm2/arch/um/include/shared/ptrace_user.h 2009-03-11 09:35:23.000000000 +0100
@@ -40,9 +40,20 @@
#define PTRACE_OLDSETOPTIONS PTRACE_SETOPTIONS
#endif
+/* These constants should eventually be added to sys/ptrace.h */
+#ifndef PTRACE_SYSCALL_SKIPCALL
+#define PTRACE_SYSCALL_SKIPCALL 0x6
+#endif
+#ifndef PTRACE_SYSCALL_SKIPEXIT
+#define PTRACE_SYSCALL_SKIPEXIT 0x2
+#endif
+
void set_using_sysemu(int value);
int get_using_sysemu(void);
extern int sysemu_supported;
+void set_using_sysptvm(int value);
+int get_using_sysptvm(void);
+extern int sysptvm_supported;
#define SELECT_PTRACE_OPERATION(sysemu_mode, singlestep_mode) \
(((int[3][3] ) { \
diff -Naur linux-2.6.29-rc7-git4/arch/um/kernel/process.c linux-2.6.29-rc7-git4.vm2/arch/um/kernel/process.c
--- linux-2.6.29-rc7-git4/arch/um/kernel/process.c 2009-03-11 09:25:27.000000000 +0100
+++ linux-2.6.29-rc7-git4.vm2/arch/um/kernel/process.c 2009-03-11 10:03:05.000000000 +0100
@@ -322,7 +322,9 @@
}
static atomic_t using_sysemu = ATOMIC_INIT(0);
+static atomic_t using_sysptvm = ATOMIC_INIT(0);
int sysemu_supported;
+int sysptvm_supported;
void set_using_sysemu(int value)
{
@@ -336,7 +338,18 @@
return atomic_read(&using_sysemu);
}
-static int proc_read_sysemu(char *buf, char **start, off_t offset, int size,int *eof, void *data)
+void set_using_sysptvm(int value)
+{
+ atomic_set(&using_sysptvm, value);
+}
+
+int get_using_sysptvm(void)
+{
+ return atomic_read(&using_sysptvm);
+}
+
+static int proc_read_sysemu(char *buf, char **start, off_t offset,
+ int size, int *eof, void *data)
{
if (snprintf(buf, size, "%d\n", get_using_sysemu()) < size)
/* No overflow */
@@ -345,7 +358,8 @@
return strlen(buf);
}
-static int proc_write_sysemu(struct file *file,const char __user *buf, unsigned long count,void *data)
+static int proc_write_sysemu(struct file *file, const char __user *buf,
+ unsigned long count, void *data)
{
char tmp[2];
@@ -358,27 +372,63 @@
return count;
}
-int __init make_proc_sysemu(void)
+
+static int proc_read_sysptvm(char *buf, char **start, off_t offset,
+ int size, int *eof, void *data)
{
- struct proc_dir_entry *ent;
- if (!sysemu_supported)
- return 0;
+ int sysptvm = (get_using_sysptvm() != 0);
+ if (snprintf(buf, size, "%d\n", sysptvm) < size)
+ /* No overflow */
+ *eof = 1;
- ent = create_proc_entry("sysemu", 0600, NULL);
+ return strlen(buf);
+}
- if (ent == NULL)
- {
- printk(KERN_WARNING "Failed to register /proc/sysemu\n");
- return 0;
- }
+static int proc_write_sysptvm(struct file *file, const char __user *buf,
+ unsigned long count, void *data)
+{
+ char tmp[2];
+
+ if (copy_from_user(tmp, buf, 1))
+ return -EFAULT;
+
+ if (tmp[0] == '0')
+ set_using_sysptvm(0);
+ if (tmp[0] == '1')
+ set_using_sysemu(/* XXX */ 6);
+ /* We use the first char, but pretend to write everything */
+ return count;
+}
- ent->read_proc = proc_read_sysemu;
- ent->write_proc = proc_write_sysemu;
+int __init make_proc_sysemu_or_sysptvm(void)
+{
+ struct proc_dir_entry *ent;
+ if (sysptvm_supported) {
+ ent = create_proc_entry("sysptvm", 0600, NULL);
+
+ if (ent == NULL) {
+ printk(KERN_WARNING "Failed to register /proc/sysptvm\n");
+ return 0;
+ }
+
+ ent->read_proc = proc_read_sysptvm;
+ ent->write_proc = proc_write_sysptvm;
+ } else if (sysemu_supported) {
+ ent = create_proc_entry("sysemu", 0600, NULL);
+
+ if (ent == NULL) {
+ printk(KERN_WARNING "Failed to register /proc/sysemu\n");
+ return 0;
+ }
+
+ ent->read_proc = proc_read_sysemu;
+ ent->write_proc = proc_write_sysemu;
+ }
return 0;
}
-late_initcall(make_proc_sysemu);
+late_initcall(make_proc_sysemu_or_sysptvm);
int singlestepping(void * t)
{
diff -Naur linux-2.6.29-rc7-git4/arch/um/kernel/ptrace.c linux-2.6.29-rc7-git4.vm2/arch/um/kernel/ptrace.c
--- linux-2.6.29-rc7-git4/arch/um/kernel/ptrace.c 2009-03-11 09:30:19.000000000 +0100
+++ linux-2.6.29-rc7-git4.vm2/arch/um/kernel/ptrace.c 2009-03-11 09:35:23.000000000 +0100
@@ -81,6 +81,8 @@
if (request == PTRACE_SYSCALL)
set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
else clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+ child->ptrace &= ~PT_SYSCALL_MASK;
+ child->ptrace |= (addr & PTRACE_SYSCALL_MASK) << 28;
child->exit_code = data;
wake_up_process(child);
ret = 0;
@@ -107,7 +109,9 @@
ret = -EIO;
if (!valid_signal(data))
break;
+ child->ptrace &= ~PT_SYSCALL_MASK;
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+ child->ptrace |= (addr & PTRACE_SYSCALL_MASK) << 28;
set_singlestepping(child, 1);
child->exit_code = data;
/* give it a chance to run. */
@@ -250,7 +254,7 @@
* XXX Check PT_DTRACE vs TIF_SINGLESTEP for singlestepping check and
* PT_PTRACED vs TIF_SYSCALL_TRACE for syscall tracing check
*/
-void syscall_trace(struct uml_pt_regs *regs, int entryexit)
+int syscall_trace(struct uml_pt_regs *regs, int entryexit)
{
int is_singlestep = (current->ptrace & PT_DTRACE) && entryexit;
int tracesysgood;
@@ -272,10 +276,13 @@
send_sigtrap(current, regs, 0);
if (!test_thread_flag(TIF_SYSCALL_TRACE))
- return;
+ return 0;
if (!(current->ptrace & PT_PTRACED))
- return;
+ return 0;
+
+ if (entryexit && (current->ptrace & PT_SYSCALL_SKIPEXIT))
+ return 0;
/*
* the 0x80 provides a way for the tracing parent to distinguish
@@ -296,4 +303,8 @@
send_sig(current->exit_code, current, 1);
current->exit_code = 0;
}
+ if (!entryexit && (current->ptrace & PT_SYSCALL_SKIPCALL))
+ return 1;
+ else
+ return 0;
}
diff -Naur linux-2.6.29-rc7-git4/arch/um/kernel/skas/syscall.c linux-2.6.29-rc7-git4.vm2/arch/um/kernel/skas/syscall.c
--- linux-2.6.29-rc7-git4/arch/um/kernel/skas/syscall.c 2009-03-11 09:25:27.000000000 +0100
+++ linux-2.6.29-rc7-git4.vm2/arch/um/kernel/skas/syscall.c 2009-03-11 09:41:29.000000000 +0100
@@ -17,8 +17,9 @@
struct pt_regs *regs = container_of(r, struct pt_regs, regs);
long result;
int syscall;
+ int skip_call;
- syscall_trace(r, 0);
+ skip_call = syscall_trace(r, 0);
/*
* This should go in the declaration of syscall, but when I do that,
@@ -29,12 +30,15 @@
* gcc version 4.0.1 20050727 (Red Hat 4.0.1-5)
* in case it's a compiler bug.
*/
- syscall = UPT_SYSCALL_NR(r);
- if ((syscall >= NR_syscalls) || (syscall < 0))
- result = -ENOSYS;
- else result = EXECUTE_SYSCALL(syscall, regs);
+ if (skip_call == 0) {
+ syscall = UPT_SYSCALL_NR(r);
+ if ((syscall >= NR_syscalls) || (syscall < 0))
+ result = -ENOSYS;
+ else
+ result = EXECUTE_SYSCALL(syscall, regs);
- REGS_SET_SYSCALL_RETURN(r->gp, result);
+ REGS_SET_SYSCALL_RETURN(r->gp, result);
+ }
syscall_trace(r, 1);
}
diff -Naur linux-2.6.29-rc7-git4/arch/um/os-Linux/skas/process.c linux-2.6.29-rc7-git4.vm2/arch/um/os-Linux/skas/process.c
--- linux-2.6.29-rc7-git4/arch/um/os-Linux/skas/process.c 2009-03-11 09:25:27.000000000 +0100
+++ linux-2.6.29-rc7-git4.vm2/arch/um/os-Linux/skas/process.c 2009-03-11 09:35:23.000000000 +0100
@@ -157,7 +157,7 @@
* (in local_using_sysemu
*/
static void handle_trap(int pid, struct uml_pt_regs *regs,
- int local_using_sysemu)
+ int local_using_sysptvm_or_sysemu)
{
int err, status;
@@ -167,7 +167,7 @@
/* Mark this as a syscall */
UPT_SYSCALL_NR(regs) = PT_SYSCALL_NR(regs->gp);
- if (!local_using_sysemu)
+ if (!local_using_sysptvm_or_sysemu)
{
err = ptrace(PTRACE_POKEUSR, pid, PT_SYSCALL_NR_OFFSET,
__NR_getpid);
@@ -354,6 +354,7 @@
int err, status, op, pid = userspace_pid[0];
/* To prevent races if using_sysemu changes under us.*/
int local_using_sysemu;
+ int local_using_sysptvm;
if (getitimer(ITIMER_VIRTUAL, &timer))
printk(UM_KERN_ERR "Failed to get itimer, errno = %d\n", errno);
@@ -375,11 +376,12 @@
/* Now we set local_using_sysemu to be used for one loop */
local_using_sysemu = get_using_sysemu();
+ local_using_sysptvm = get_using_sysptvm();
op = SELECT_PTRACE_OPERATION(local_using_sysemu,
singlestepping(NULL));
- if (ptrace(op, pid, 0, 0)) {
+ if (ptrace(op, pid, local_using_sysptvm, 0)) {
printk(UM_KERN_ERR "userspace - ptrace continue "
"failed, op = %d, errno = %d\n", op, errno);
fatal_sigsegv();
diff -Naur linux-2.6.29-rc7-git4/arch/um/os-Linux/start_up.c linux-2.6.29-rc7-git4.vm2/arch/um/os-Linux/start_up.c
--- linux-2.6.29-rc7-git4/arch/um/os-Linux/start_up.c 2009-03-11 09:25:27.000000000 +0100
+++ linux-2.6.29-rc7-git4.vm2/arch/um/os-Linux/start_up.c 2009-03-11 09:58:40.000000000 +0100
@@ -198,6 +198,34 @@
" See http://perso.wanadoo.fr/laurent.vivier/UML/ for further \n"
" information.\n\n");
+/* Changed only during early boot */
+static int force_sysptvm_disabled;
+
+static int __init nosysptvm_cmd_param(char *str, int* add)
+{
+ force_sysptvm_disabled = 1;
+ return 0;
+}
+
+__uml_setup("nosysptvm", nosysptvm_cmd_param,
+"nosysptvm\n"
+" Turns off syscall emulation tags for ptrace (ptrace_vm) on.\n"
+" Ptrace_vm is a feature introduced by Renzo Davoli. It changes\n"
+" behaviour of ptrace() and helps reducing host context switch rate.\n\n");
+
+static int use_sysemu;
+
+static int __init usesysemu_cmd_param(char *str, int* add)
+{
+ use_sysemu = 1;
+ return 0;
+}
+
+__uml_setup("usesysemu", usesysemu_cmd_param,
+"usesysemu\n"
+" Use sysemu instead of sysptvm even when the kernel supports it.\n\n"
+);
+
static void __init check_sysemu(void)
{
unsigned long regs[MAX_REG_NR];
@@ -293,6 +321,114 @@
non_fatal("missing\n");
}
+/*
+ * test thread code. This thread is started only to test
+ * which features are provided by the linux kernel
+ */
+static int sysptvm_child(void *arg)
+{
+ int *featurep = arg;
+ int p[2] = {-1, -1};
+ pid_t pid = os_getpid();
+ if (ptrace(PTRACE_TRACEME, 0, 0, 0) < 0) {
+ perror("ptrace test_ptracemulti");
+ kill(pid, SIGKILL);
+ }
+ kill(pid, SIGSTOP);
+ *featurep = 0;
+ os_getpid();
+ /*
+ * if it reaches this point in 1 stop it means that
+ * PTRACE_SYSCALL_SKIPEXIT works
+ */
+ *featurep = PTRACE_SYSCALL_SKIPEXIT;
+ pipe(p);
+ /*
+ * if after a PTRACE_SYSCALL_SKIPCALL p[0] is already <0
+ * pipe has been really skipped
+ */
+ if (p[0] < 0)
+ *featurep = PTRACE_SYSCALL_SKIPCALL;
+ else { /* clean up everything */
+ close(p[0]);
+ close(p[1]);
+ }
+ return 0;
+}
+
+/*
+ * kernel feature test:
+ * it returns:
+ * -1 error
+ * 0 old PTRACE_SYSCALL (addr is ignored)
+ * PTRACE_SYSCALL_SKIPEXIT: just skip_exit is provided
+ * PTRACE_SYSCALL_SKIPCALL: the entire syntax is implemented
+ * by the running kernel
+ */
+static int __init test_ptrace_sysptvm(void)
+{
+ int pid, status, rv, feature;
+ static char stack[1024];
+ feature = 0;
+
+ pid = clone(sysptvm_child, &stack[1020], SIGCHLD | CLONE_VM, &feature);
+ if (pid < 0)
+ return 0;
+ if (waitpid(pid, &status, WUNTRACED) < 0) {
+ kill(pid, SIGKILL);
+ return 0;
+ }
+ /* restart and wait for the next syscall (getpid)*/
+ rv = ptrace(PTRACE_SYSCALL, pid, 0, 0);
+ if (waitpid(pid, &status, WUNTRACED) < 0)
+ goto out;
+ /* try to skip the exit call */
+ rv = ptrace(PTRACE_SYSCALL, pid, PTRACE_SYSCALL_SKIPEXIT, 0);
+ if (rv < 0)
+ goto out;
+ /* wait for the next stop */
+ if (waitpid(pid, &status, WUNTRACED) < 0)
+ goto out;
+ /*
+ * if feature is already 0 it means that this is the exit call,
+ * and it has not been skipped, otherwise this is the
+ * entry call for the system call "time"
+ */
+ if (feature < PTRACE_SYSCALL_SKIPEXIT)
+ goto out;
+ /* restart (time) and try to skip the entire call */
+ rv = ptrace(PTRACE_SYSCALL, pid, PTRACE_SYSCALL_SKIPCALL, 0);
+ if (waitpid(pid, &status, WUNTRACED) < 0)
+ return 0;
+out:
+ ptrace(PTRACE_KILL, pid, 0, 0);
+ /* eliminate zombie */
+ if (waitpid(pid, &status, WUNTRACED) < 0)
+ return 0;
+ return feature;
+}
+
+static int __init check_sysptvm(void)
+{
+ int feature = test_ptrace_sysptvm();
+
+ non_fatal("Checking ptrace new tags for syscall emulation...");
+ if (feature == PTRACE_SYSCALL_SKIPCALL) {
+ sysptvm_supported = 1;
+ non_fatal("OK");
+ if (!force_sysptvm_disabled) {
+ set_using_sysptvm(PTRACE_SYSCALL_SKIPCALL);
+ non_fatal("\n");
+ return 1;
+ } else {
+ non_fatal(" (disabled)\n");
+ return 0;
+ }
+ } else
+ non_fatal("unsupported\n");
+ return 0;
+}
+
static void __init check_ptrace(void)
{
int pid, syscall, n, status;
@@ -330,7 +466,8 @@
}
stop_ptraced_child(pid, 0, 1);
non_fatal("OK\n");
- check_sysemu();
+ if (use_sysemu || !check_sysptvm())
+ check_sysemu();
}
extern void check_tmpexec(void);
On Wed, Mar 11, 2009 at 02:41:38PM +0100, Renzo Davoli wrote:
>> Please check Documentation/CodingStyle. Every second line above
>> violates it. scripts/checkpatch.pl can help out with the more
>> obvious ones.
>Ingo,
>
>Thank you for your comment.
>You are right, I beg your pardon.
>I have updated the patch, now it should be (more) consistent
>with the Coding Style specifications.
You can use scripts/checkpatch.pl to check it before sending.
>
>This patch adds the new PTRACE_VM_SKIPCALL and PTRACE_VM_SKIPEXIT
>tags for ptrace's addr parameter.
>In this way it is possible to (eventually) get rid of PTRACE_SYSEMU
>PTRACE_SYSEMU_SINGLESTEP, while providing not only the same features
>but a more general support for Virtual Machines.
>Part#2: user-mode Linux support.
>User-mode Linux by this patch uses PTRACE_VM of the hosting operating system
>and provides PTRACE_VM to its processes.
>UML tests at startup which features are provided and uses PTRACE_VM or
>PTRACE_SYSEMU (or nothing). PTRACE_VM and/or PTRACE_SYSEMU support can be
>disabled by command line flags.
>
So what? PTRACE_VM is only supported in UML with this patch,
UML still has to use PTRACE_SYSEMU on x86_32.
Am I missing something? :)
>
>Signed-off-by: Renzo Davoli <[email protected]>
Minor comments below.
>----
>diff -Naur linux-2.6.29-rc7-git4/arch/um/include/shared/kern_util.h linux-2.6.29-rc7-git4.vm2/arch/um/include/shared/kern_util.h
>--- linux-2.6.29-rc7-git4/arch/um/include/shared/kern_util.h 2009-03-11 09:25:27.000000000 +0100
>+++ linux-2.6.29-rc7-git4.vm2/arch/um/include/shared/kern_util.h 2009-03-11 09:35:23.000000000 +0100
>@@ -57,7 +57,7 @@
> extern unsigned long to_irq_stack(unsigned long *mask_out);
> extern unsigned long from_irq_stack(int nested);
>
>-extern void syscall_trace(struct uml_pt_regs *regs, int entryexit);
>+extern int syscall_trace(struct uml_pt_regs *regs, int entryexit);
> extern int singlestepping(void *t);
>
> extern void segv_handler(int sig, struct uml_pt_regs *regs);
>diff -Naur linux-2.6.29-rc7-git4/arch/um/include/shared/ptrace_user.h linux-2.6.29-rc7-git4.vm2/arch/um/include/shared/ptrace_user.h
>--- linux-2.6.29-rc7-git4/arch/um/include/shared/ptrace_user.h 2009-03-11 09:25:27.000000000 +0100
>+++ linux-2.6.29-rc7-git4.vm2/arch/um/include/shared/ptrace_user.h 2009-03-11 09:35:23.000000000 +0100
>@@ -40,9 +40,20 @@
> #define PTRACE_OLDSETOPTIONS PTRACE_SETOPTIONS
> #endif
>
>+/* these constant should eventually enter in sys/ptrace.h */
>+#ifndef PTRACE_SYSCALL_SKIPCALL
>+#define PTRACE_SYSCALL_SKIPCALL 0x6
>+#endif
>+#ifndef PTRACE_SYSCALL_SKIPEXIT
>+#define PTRACE_SYSCALL_SKIPEXIT 0x2
>+#endif
>+
> void set_using_sysemu(int value);
> int get_using_sysemu(void);
> extern int sysemu_supported;
>+void set_using_sysptvm(int value);
>+int get_using_sysptvm(void);
>+extern int sysptvm_supported;
>
> #define SELECT_PTRACE_OPERATION(sysemu_mode, singlestep_mode) \
> (((int[3][3] ) { \
>diff -Naur linux-2.6.29-rc7-git4/arch/um/kernel/process.c linux-2.6.29-rc7-git4.vm2/arch/um/kernel/process.c
>--- linux-2.6.29-rc7-git4/arch/um/kernel/process.c 2009-03-11 09:25:27.000000000 +0100
>+++ linux-2.6.29-rc7-git4.vm2/arch/um/kernel/process.c 2009-03-11 10:03:05.000000000 +0100
>@@ -322,7 +322,9 @@
> }
>
> static atomic_t using_sysemu = ATOMIC_INIT(0);
>+static atomic_t using_sysptvm = ATOMIC_INIT(0);
> int sysemu_supported;
>+int sysptvm_supported;
>
> void set_using_sysemu(int value)
> {
>@@ -336,7 +338,18 @@
> return atomic_read(&using_sysemu);
> }
>
>-static int proc_read_sysemu(char *buf, char **start, off_t offset, int size,int *eof, void *data)
>+void set_using_sysptvm(int value)
>+{
>+ atomic_set(&using_sysptvm, value);
>+}
>+
>+int get_using_sysptvm(void)
>+{
>+ return atomic_read(&using_sysptvm);
>+}
How about making it boolean? AFAIK, you use it as a boolean.
>+
>+static int proc_read_sysemu(char *buf, char **start, off_t offset,
>+ int size, int *eof, void *data)
> {
> if (snprintf(buf, size, "%d\n", get_using_sysemu()) < size)
> /* No overflow */
>@@ -345,7 +358,8 @@
> return strlen(buf);
> }
>
>-static int proc_write_sysemu(struct file *file,const char __user *buf, unsigned long count,void *data)
>+static int proc_write_sysemu(struct file *file, const char __user *buf,
>+ unsigned long count, void *data)
> {
> char tmp[2];
>
>@@ -358,27 +372,63 @@
> return count;
> }
>
>-int __init make_proc_sysemu(void)
>+
>+static int proc_read_sysptvm(char *buf, char **start, off_t offset,
>+ int size, int *eof, void *data)
> {
>- struct proc_dir_entry *ent;
>- if (!sysemu_supported)
>- return 0;
>+ int sysptvm = (get_using_sysptvm() != 0);
>+ if (snprintf(buf, size, "%d\n", sysptvm) < size)
>+ /* No overflow */
>+ *eof = 1;
>
>- ent = create_proc_entry("sysemu", 0600, NULL);
>+ return strlen(buf);
>+}
>
>- if (ent == NULL)
>- {
>- printk(KERN_WARNING "Failed to register /proc/sysemu\n");
>- return 0;
>- }
>+static int proc_write_sysptvm(struct file *file, const char __user *buf,
>+ unsigned long count, void *data)
>+{
>+ char tmp[2];
>+
>+ if (copy_from_user(tmp, buf, 1))
>+ return -EFAULT;
>+
>+ if (tmp[0] == '0')
>+ set_using_sysptvm(0);
>+ if (tmp[0] == '1')
>+ set_using_sysemu(/* XXX */ 6);
>+ /* We use the first char, but pretend to write everything */
>+ return count;
>+}
>
>- ent->read_proc = proc_read_sysemu;
>- ent->write_proc = proc_write_sysemu;
>+int __init make_proc_sysemu_or_sysptvm(void)
>+{
>+ struct proc_dir_entry *ent;
>
>+ if (sysptvm_supported) {
>+ ent = create_proc_entry("sysptvm", 0600, NULL);
>+
>+ if (ent == NULL) {
>+ printk(KERN_WARNING "Failed to register /proc/sysptvm\n");
>+ return 0;
>+ }
>+
>+ ent->read_proc = proc_read_sysptvm;
>+ ent->write_proc = proc_write_sysptvm;
>+ } else if (sysemu_supported) {
>+ ent = create_proc_entry("sysemu", 0600, NULL);
>+
>+ if (ent == NULL) {
>+ printk(KERN_WARNING "Failed to register /proc/sysemu\n");
>+ return 0;
>+ }
>+
>+ ent->read_proc = proc_read_sysemu;
>+ ent->write_proc = proc_write_sysemu;
>+ }
> return 0;
> }
>
>-late_initcall(make_proc_sysemu);
>+late_initcall(make_proc_sysemu_or_sysptvm);
>
> int singlestepping(void * t)
> {
>diff -Naur linux-2.6.29-rc7-git4/arch/um/kernel/ptrace.c linux-2.6.29-rc7-git4.vm2/arch/um/kernel/ptrace.c
>--- linux-2.6.29-rc7-git4/arch/um/kernel/ptrace.c 2009-03-11 09:30:19.000000000 +0100
>+++ linux-2.6.29-rc7-git4.vm2/arch/um/kernel/ptrace.c 2009-03-11 09:35:23.000000000 +0100
>@@ -81,6 +81,8 @@
> if (request == PTRACE_SYSCALL)
> set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
> else clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
>+ child->ptrace &= ~PT_SYSCALL_MASK;
>+ child->ptrace |= (addr & PTRACE_SYSCALL_MASK) << 28;
28, ditto.
> child->exit_code = data;
> wake_up_process(child);
> ret = 0;
>@@ -107,7 +109,9 @@
> ret = -EIO;
> if (!valid_signal(data))
> break;
>+ child->ptrace &= ~PT_SYSCALL_MASK;
> clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
>+ child->ptrace |= (addr & PTRACE_SYSCALL_MASK) << 28;
> set_singlestepping(child, 1);
> child->exit_code = data;
> /* give it a chance to run. */
>@@ -250,7 +254,7 @@
> * XXX Check PT_DTRACE vs TIF_SINGLESTEP for singlestepping check and
> * PT_PTRACED vs TIF_SYSCALL_TRACE for syscall tracing check
> */
>-void syscall_trace(struct uml_pt_regs *regs, int entryexit)
>+int syscall_trace(struct uml_pt_regs *regs, int entryexit)
> {
> int is_singlestep = (current->ptrace & PT_DTRACE) && entryexit;
> int tracesysgood;
>@@ -272,10 +276,13 @@
> send_sigtrap(current, regs, 0);
>
> if (!test_thread_flag(TIF_SYSCALL_TRACE))
>- return;
>+ return 0;
>
> if (!(current->ptrace & PT_PTRACED))
>- return;
>+ return 0;
>+
>+ if (entryexit && (current->ptrace & PT_SYSCALL_SKIPEXIT))
>+ return 0;
>
> /*
> * the 0x80 provides a way for the tracing parent to distinguish
>@@ -296,4 +303,8 @@
> send_sig(current->exit_code, current, 1);
> current->exit_code = 0;
> }
>+ if (!entryexit && (current->ptrace & PT_SYSCALL_SKIPCALL))
>+ return 1;
>+ else
>+ return 0;
> }
>diff -Naur linux-2.6.29-rc7-git4/arch/um/kernel/skas/syscall.c linux-2.6.29-rc7-git4.vm2/arch/um/kernel/skas/syscall.c
>--- linux-2.6.29-rc7-git4/arch/um/kernel/skas/syscall.c 2009-03-11 09:25:27.000000000 +0100
>+++ linux-2.6.29-rc7-git4.vm2/arch/um/kernel/skas/syscall.c 2009-03-11 09:41:29.000000000 +0100
>@@ -17,8 +17,9 @@
> struct pt_regs *regs = container_of(r, struct pt_regs, regs);
> long result;
> int syscall;
>+ int skip_call;
>
>- syscall_trace(r, 0);
>+ skip_call = syscall_trace(r, 0);
>
> /*
> * This should go in the declaration of syscall, but when I do that,
>@@ -29,12 +30,15 @@
> * gcc version 4.0.1 20050727 (Red Hat 4.0.1-5)
> * in case it's a compiler bug.
> */
>- syscall = UPT_SYSCALL_NR(r);
>- if ((syscall >= NR_syscalls) || (syscall < 0))
>- result = -ENOSYS;
>- else result = EXECUTE_SYSCALL(syscall, regs);
>+ if (skip_call == 0) {
>+ syscall = UPT_SYSCALL_NR(r);
>+ if ((syscall >= NR_syscalls) || (syscall < 0))
>+ result = -ENOSYS;
>+ else
>+ result = EXECUTE_SYSCALL(syscall, regs);
>
>- REGS_SET_SYSCALL_RETURN(r->gp, result);
>+ REGS_SET_SYSCALL_RETURN(r->gp, result);
>+ }
>
> syscall_trace(r, 1);
> }
>diff -Naur linux-2.6.29-rc7-git4/arch/um/os-Linux/skas/process.c linux-2.6.29-rc7-git4.vm2/arch/um/os-Linux/skas/process.c
>--- linux-2.6.29-rc7-git4/arch/um/os-Linux/skas/process.c 2009-03-11 09:25:27.000000000 +0100
>+++ linux-2.6.29-rc7-git4.vm2/arch/um/os-Linux/skas/process.c 2009-03-11 09:35:23.000000000 +0100
>@@ -157,7 +157,7 @@
> * (in local_using_sysemu
> */
> static void handle_trap(int pid, struct uml_pt_regs *regs,
>- int local_using_sysemu)
>+ int local_using_sysptvm_or_sysemu)
This argument name is too long. :)
> {
> int err, status;
>
>@@ -167,7 +167,7 @@
> /* Mark this as a syscall */
> UPT_SYSCALL_NR(regs) = PT_SYSCALL_NR(regs->gp);
>
>- if (!local_using_sysemu)
>+ if (!local_using_sysptvm_or_sysemu)
> {
> err = ptrace(PTRACE_POKEUSR, pid, PT_SYSCALL_NR_OFFSET,
> __NR_getpid);
>@@ -354,6 +354,7 @@
> int err, status, op, pid = userspace_pid[0];
> /* To prevent races if using_sysemu changes under us.*/
> int local_using_sysemu;
>+ int local_using_sysptvm;
>
> if (getitimer(ITIMER_VIRTUAL, &timer))
> printk(UM_KERN_ERR "Failed to get itimer, errno = %d\n", errno);
>@@ -375,11 +376,12 @@
>
> /* Now we set local_using_sysemu to be used for one loop */
> local_using_sysemu = get_using_sysemu();
>+ local_using_sysptvm = get_using_sysptvm();
>
> op = SELECT_PTRACE_OPERATION(local_using_sysemu,
> singlestepping(NULL));
>
>- if (ptrace(op, pid, 0, 0)) {
>+ if (ptrace(op, pid, local_using_sysptvm, 0)) {
> printk(UM_KERN_ERR "userspace - ptrace continue "
> "failed, op = %d, errno = %d\n", op, errno);
> fatal_sigsegv();
>diff -Naur linux-2.6.29-rc7-git4/arch/um/os-Linux/start_up.c linux-2.6.29-rc7-git4.vm2/arch/um/os-Linux/start_up.c
>--- linux-2.6.29-rc7-git4/arch/um/os-Linux/start_up.c 2009-03-11 09:25:27.000000000 +0100
>+++ linux-2.6.29-rc7-git4.vm2/arch/um/os-Linux/start_up.c 2009-03-11 09:58:40.000000000 +0100
>@@ -198,6 +198,34 @@
> " See http://perso.wanadoo.fr/laurent.vivier/UML/ for further \n"
> " information.\n\n");
>
>+/* Changed only during early boot */
>+static int force_sysptvm_disabled;
>+
>+static int __init nosysptvm_cmd_param(char *str, int* add)
>+{
>+ force_sysptvm_disabled = 1;
>+ return 0;
>+}
>+
>+__uml_setup("nosysptvm", nosysptvm_cmd_param,
>+"nosysptvm\n"
>+" Turns off syscall emulation tags for ptrace (ptrace_vm) on.\n"
>+" Ptrace_vm is a feature introduced by Renzo Davoli. It changes\n"
>+" behaviour of ptrace() and helps reducing host context switch rate.\n\n");
>+
>+static int use_sysemu;
>+
>+static int __init usesysemu_cmd_param(char *str, int* add)
I don't like this function name either. :(
>+{
>+ use_sysemu = 1;
>+ return 0;
>+}
>+
>+__uml_setup("usesysemu", usesysemu_cmd_param,
>+"usesysemu\n"
>+" Use sysemu instead of sysptvm even when the kernel supports it.\n\n"
>+);
>+
> static void __init check_sysemu(void)
> {
> unsigned long regs[MAX_REG_NR];
>@@ -293,6 +321,114 @@
> non_fatal("missing\n");
> }
>
>+/*
>+ * test thread code. This thread is started only to test
>+ * which features are provided by the linux kernel
>+ */
>+static int sysptvm_child(void *arg)
>+{
>+ int *featurep = arg;
>+ int p[2] = {-1, -1};
>+ pid_t pid = os_getpid();
>+ if (ptrace(PTRACE_TRACEME, 0, 0, 0) < 0) {
>+ perror("ptrace test_ptracemulti");
>+ kill(pid, SIGKILL);
>+ }
>+ kill(pid, SIGSTOP);
>+ *featurep = 0;
>+ os_getpid();
>+ /*
>+ * if it reaches this point in 1 stop it means that
>+ * PTRACE_SYSCALL_SKIPEXIT works
>+ */
>+ *featurep = PTRACE_SYSCALL_SKIPEXIT;
>+ pipe(p);
>+ /*
>+ * if after a PTRACE_SYSCALL_SKIPCALL p[0] is already <0
>+ * pipe has been really skipped
>+ */
>+ if (p[0] < 0)
>+ *featurep = PTRACE_SYSCALL_SKIPCALL;
>+ else { /* clean up everything */
>+ close(p[0]);
>+ close(p[1]);
>+ }
>+ return 0;
>+}
>+
>+/*
>+ * kernel feature test:
>+ * it returns:
>+ * -1 error
>+ * 0 old PTRACE_SYSCALL (addr is ignored)
>+ * PTRACE_SYSCALL_SKIPEXIT: just skip_exit is provided
>+ * PTRACE_SYSCALL_SKIPCALL: the entire syntax is implemented
>+ * by the running kernel
>+ */
>+static int __init test_ptrace_sysptvm(void)
How about check_ptrace_sysptvm? Since it is consistent with
other check_XXX functions.
>+{
>+ int pid, status, rv, feature;
>+ static char stack[1024];
>+ feature = 0;
>+
>+ pid = clone(sysptvm_child, &stack[1020], SIGCHLD | CLONE_VM, &feature);
>+ if (pid < 0)
>+ return 0;
>+ if (waitpid(pid, &status, WUNTRACED) < 0) {
>+ kill(pid, SIGKILL);
>+ return 0;
>+ }
>+ /* restart and wait for the next syscall (getpid)*/
>+ rv = ptrace(PTRACE_SYSCALL, pid, 0, 0);
>+ if (waitpid(pid, &status, WUNTRACED) < 0)
>+ goto out;
>+ /* try to skip the exit call */
>+ rv = ptrace(PTRACE_SYSCALL, pid, PTRACE_SYSCALL_SKIPEXIT, 0);
>+ if (rv < 0)
>+ goto out;
>+ /* wait for the next stop */
>+ if (waitpid(pid, &status, WUNTRACED) < 0)
>+ goto out;
>+ /*
>+ * if feature is already 0 it means that this is the exit call,
>+ * and it has not been skipped, otherwise this is the
>+ * entry call for the system call "time"
>+ */
>+ if (feature < PTRACE_SYSCALL_SKIPEXIT)
>+ goto out;
>+ /* restart (time) and and try to skip the entire call */
>+ rv = ptrace(PTRACE_SYSCALL, pid, PTRACE_SYSCALL_SKIPCALL, 0);
>+ if (waitpid(pid, &status, WUNTRACED) < 0)
>+ return 0;
>+out:
>+ ptrace(PTRACE_KILL, pid, 0, 0);
>+ /* eliminate zombie */
>+ if (waitpid(pid, &status, WUNTRACED) < 0)
>+ return 0;
>+ return feature;
>+}
>+
>+static int __init check_sysptvm(void)
>+{
>+ int feature = test_ptrace_sysptvm();
>+
>+ non_fatal("Checking ptrace new tags for syscall emulation...");
>+ if (feature == PTRACE_SYSCALL_SKIPCALL) {
>+ sysptvm_supported = 1;
>+ non_fatal("OK");
>+ if (!force_sysptvm_disabled) {
>+ set_using_sysptvm(PTRACE_SYSCALL_SKIPCALL);
>+ non_fatal("\n");
>+ return 1;
>+ } else {
>+ non_fatal(" (disabled)\n");
>+ return 0;
>+ }
>+ } else
>+ non_fatal("unsupported\n");
>+ return 0;
>+}
>+
> static void __init check_ptrace(void)
> {
> int pid, syscall, n, status;
>@@ -330,7 +466,8 @@
> }
> stop_ptraced_child(pid, 0, 1);
> non_fatal("OK\n");
>- check_sysemu();
>+ if (use_sysemu || !check_sysptvm())
>+ check_sysemu();
> }
>
> extern void check_tmpexec(void);
--
Do what you love, f**k the rest! F**k the regulations!
Dear Cong,
Thank you for the detailed analysis of the code.
I'll change the code taking care of your observations asap.
On Mon, Mar 16, 2009 at 04:15:08PM +0800, Américo Wang wrote:
> >I have updated the patch, now it should be (more) consistent
> >with the Coding Style specifications.
> You can use scripts/checkpatch.pl to check it before sending.
I read the coding style document and I used the perl script.
However, the script is not able to cope with all the style specifications
and I may have missed something more.
>
> >UML tests at startup which features are provided and uses PTRACE_VM or
> >PTRACE_SYSEMU (or nothing). PTRACE_VM and/or PTRACE_SYSEMU support can be
> >disabled by command line flags.
> So what? PTRACE_VM is only supported in UML with this patch,
> UML still has to use PTRACE_SYSEMU on x86_32.
>
> Am I missing something? :)
This patch [2/2] is for UML (host and guest). Patch #1 provides PTRACE_VM
for all the architectures supporting ptrace via tracehook.
By applying both patches PTRACE_VM is available in the following architectures:
x86*, sparc*, s390, powerpc*, ia64, sh* and um.
(I have not tested all these architectures, but the patch applies to the core ptrace
code, shared by all of them).
Ptrace_vm then provides the same speedup as PTRACE_SYSEMU while extending its support:
- to other architectures: ports of UML or similar code for other architectures can
use it
- to other applications: PTRACE_SYSEMU supports the virtualization of all the system calls
while with PTRACE_VM the VM monitor can virtualize some of the system calls, depending on
some condition, e.g. the value of a parameter. In this way it is possible to give a faster
implementation to partial virtual machines like my umview.
With patch #2 user-mode linux also uses ptrace_vm where available.
renzo
On Mon, Mar 16, 2009 at 01:17:32PM +0100, Renzo Davoli wrote:
>Dear Cong,
>
>Thank you for the detailed analysis of the code.
>I'll change the code taking care of your observations asap.
You are so welcome. :)
>
>On Mon, Mar 16, 2009 at 04:15:08PM +0800, Américo Wang wrote:
>> >I have updated the patch, now it should be (more) consistent
>> >with the Coding Style specifications.
>> You can use scripts/checkpatch.pl to check it before sending.
>I read the coding style document and I used the perl script.
>However, the script is not able to cope with all the style specifications
>and I may have missed something more.
>>
>> >UML tests at startup which features are provided and uses PTRACE_VM or
>> >PTRACE_SYSEMU (or nothing). PTRACE_VM and/or PTRACE_SYSEMU support can be
>> >disabled by command line flags.
>> So what? PTRACE_VM is only supported in UML with this patch,
>> UML still has to use PTRACE_SYSEMU on x86_32.
>>
>> Am I missing something? :)
>This patch [2/2] is for UML (host and guest). Patch #1 provides PTRACE_VM
>for all the architectures supporting ptrace via tracehook.
>By applying both patches PTRACE_VM is available in the following architectures:
>x86*, sparc*, s390, powerpc*, ia64, sh* and um.
>(I have not tested all these architectures, but the patch applies to the core ptrace
>code, shared by all of them).
Ok then. I am not familiar with tracehooks.
>Ptrace_vm then provides the same speedup of PTRACE_SYSEMU extending its support:
>- to other architectures: ports of UML or similar code for other architectures can
>use it
>- to other applications: PTRACE_SYSEMU supports the virtualization of all the system calls
>while by PTRACE_VM the VM monitor can virtualize some of the system calls, depending on
>some condition e.g. the value of a parameter. It is possible in this way give a faster
>implementation to partial virtual machines like my umview.
>
>With patch #2 user-mode linux also uses ptrace_vm where available.
>
Thanks for your explanations!
--
Do what you love, f**k the rest! F**k the regulations!
Patch rebased on 2.6.29. I have fixed the code following Cong's suggestion.
renzo
Although get_using_sysptvm is used as a boolean, I have left it int just
for the sake of symmetry with get_using_sysemu.
It could be safely changed to boolean at any time.
Signed-off-by: Renzo Davoli <[email protected]>
---
diff -Naur linux-2.6.29-vm/arch/um/include/shared/kern_util.h linux-2.6.29-vm2/arch/um/include/shared/kern_util.h
--- linux-2.6.29-vm/arch/um/include/shared/kern_util.h 2009-03-24 22:04:09.000000000 +0100
+++ linux-2.6.29-vm2/arch/um/include/shared/kern_util.h 2009-03-24 22:12:50.000000000 +0100
@@ -57,7 +57,7 @@
extern unsigned long to_irq_stack(unsigned long *mask_out);
extern unsigned long from_irq_stack(int nested);
-extern void syscall_trace(struct uml_pt_regs *regs, int entryexit);
+extern int syscall_trace(struct uml_pt_regs *regs, int entryexit);
extern int singlestepping(void *t);
extern void segv_handler(int sig, struct uml_pt_regs *regs);
diff -Naur linux-2.6.29-vm/arch/um/include/shared/ptrace_user.h linux-2.6.29-vm2/arch/um/include/shared/ptrace_user.h
--- linux-2.6.29-vm/arch/um/include/shared/ptrace_user.h 2009-03-24 22:04:09.000000000 +0100
+++ linux-2.6.29-vm2/arch/um/include/shared/ptrace_user.h 2009-03-24 22:12:50.000000000 +0100
@@ -40,9 +40,20 @@
#define PTRACE_OLDSETOPTIONS PTRACE_SETOPTIONS
#endif
+/* these constant should eventually enter in sys/ptrace.h */
+#ifndef PTRACE_SYSCALL_SKIPCALL
+#define PTRACE_SYSCALL_SKIPCALL 0x6
+#endif
+#ifndef PTRACE_SYSCALL_SKIPEXIT
+#define PTRACE_SYSCALL_SKIPEXIT 0x2
+#endif
+
void set_using_sysemu(int value);
int get_using_sysemu(void);
extern int sysemu_supported;
+void set_using_sysptvm(int value);
+int get_using_sysptvm(void);
+extern int sysptvm_supported;
#define SELECT_PTRACE_OPERATION(sysemu_mode, singlestep_mode) \
(((int[3][3] ) { \
diff -Naur linux-2.6.29-vm/arch/um/kernel/process.c linux-2.6.29-vm2/arch/um/kernel/process.c
--- linux-2.6.29-vm/arch/um/kernel/process.c 2009-03-24 22:04:09.000000000 +0100
+++ linux-2.6.29-vm2/arch/um/kernel/process.c 2009-03-24 22:12:50.000000000 +0100
@@ -322,7 +322,9 @@
}
static atomic_t using_sysemu = ATOMIC_INIT(0);
+static atomic_t using_sysptvm = ATOMIC_INIT(0);
int sysemu_supported;
+int sysptvm_supported;
void set_using_sysemu(int value)
{
@@ -336,7 +338,18 @@
return atomic_read(&using_sysemu);
}
-static int proc_read_sysemu(char *buf, char **start, off_t offset, int size,int *eof, void *data)
+void set_using_sysptvm(int value)
+{
+ atomic_set(&using_sysptvm, value);
+}
+
+int get_using_sysptvm(void)
+{
+ return atomic_read(&using_sysptvm);
+}
+
+static int proc_read_sysemu(char *buf, char **start, off_t offset,
+ int size, int *eof, void *data)
{
if (snprintf(buf, size, "%d\n", get_using_sysemu()) < size)
/* No overflow */
@@ -345,7 +358,8 @@
return strlen(buf);
}
-static int proc_write_sysemu(struct file *file,const char __user *buf, unsigned long count,void *data)
+static int proc_write_sysemu(struct file *file, const char __user *buf,
+ unsigned long count, void *data)
{
char tmp[2];
@@ -358,27 +372,63 @@
return count;
}
-int __init make_proc_sysemu(void)
+
+static int proc_read_sysptvm(char *buf, char **start, off_t offset,
+ int size, int *eof, void *data)
{
- struct proc_dir_entry *ent;
- if (!sysemu_supported)
- return 0;
+ int sysptvm = (get_using_sysptvm() != 0);
+ if (snprintf(buf, size, "%d\n", sysptvm) < size)
+ /* No overflow */
+ *eof = 1;
- ent = create_proc_entry("sysemu", 0600, NULL);
+ return strlen(buf);
+}
- if (ent == NULL)
- {
- printk(KERN_WARNING "Failed to register /proc/sysemu\n");
- return 0;
- }
+static int proc_write_sysptvm(struct file *file, const char __user *buf,
+ unsigned long count, void *data)
+{
+ char tmp[2];
+
+ if (copy_from_user(tmp, buf, 1))
+ return -EFAULT;
+
+ if (tmp[0] == '0')
+ set_using_sysptvm(0);
+ if (tmp[0] == '1')
+ set_using_sysemu(/* XXX */ 6);
+ /* We use the first char, but pretend to write everything */
+ return count;
+}
- ent->read_proc = proc_read_sysemu;
- ent->write_proc = proc_write_sysemu;
+int __init make_proc_sysemu_or_sysptvm(void)
+{
+ struct proc_dir_entry *ent;
+ if (sysptvm_supported) {
+ ent = create_proc_entry("sysptvm", 0600, NULL);
+
+ if (ent == NULL) {
+ printk(KERN_WARNING "Failed to register /proc/sysptvm\n");
+ return 0;
+ }
+
+ ent->read_proc = proc_read_sysptvm;
+ ent->write_proc = proc_write_sysptvm;
+ } else if (sysemu_supported) {
+ ent = create_proc_entry("sysemu", 0600, NULL);
+
+ if (ent == NULL) {
+ printk(KERN_WARNING "Failed to register /proc/sysemu\n");
+ return 0;
+ }
+
+ ent->read_proc = proc_read_sysemu;
+ ent->write_proc = proc_write_sysemu;
+ }
return 0;
}
-late_initcall(make_proc_sysemu);
+late_initcall(make_proc_sysemu_or_sysptvm);
int singlestepping(void * t)
{
diff -Naur linux-2.6.29-vm/arch/um/kernel/ptrace.c linux-2.6.29-vm2/arch/um/kernel/ptrace.c
--- linux-2.6.29-vm/arch/um/kernel/ptrace.c 2009-03-24 22:04:09.000000000 +0100
+++ linux-2.6.29-vm2/arch/um/kernel/ptrace.c 2009-03-24 22:14:51.000000000 +0100
@@ -81,6 +81,8 @@
if (request == PTRACE_SYSCALL)
set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
else clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+ child->ptrace &= ~PT_SYSCALL_MASK;
+ child->ptrace |= PTRACE2PT_SYSCALL(addr);
child->exit_code = data;
wake_up_process(child);
ret = 0;
@@ -107,7 +109,9 @@
ret = -EIO;
if (!valid_signal(data))
break;
+ child->ptrace &= ~PT_SYSCALL_MASK;
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+ child->ptrace |= (addr & PTRACE_SYSCALL_MASK) << 28;
set_singlestepping(child, 1);
child->exit_code = data;
/* give it a chance to run. */
@@ -250,7 +254,7 @@
* XXX Check PT_DTRACE vs TIF_SINGLESTEP for singlestepping check and
* PT_PTRACED vs TIF_SYSCALL_TRACE for syscall tracing check
*/
-void syscall_trace(struct uml_pt_regs *regs, int entryexit)
+int syscall_trace(struct uml_pt_regs *regs, int entryexit)
{
int is_singlestep = (current->ptrace & PT_DTRACE) && entryexit;
int tracesysgood;
@@ -272,10 +276,13 @@
send_sigtrap(current, regs, 0);
if (!test_thread_flag(TIF_SYSCALL_TRACE))
- return;
+ return 0;
if (!(current->ptrace & PT_PTRACED))
- return;
+ return 0;
+
+ if (entryexit && (current->ptrace & PT_SYSCALL_SKIPEXIT))
+ return 0;
/*
* the 0x80 provides a way for the tracing parent to distinguish
@@ -296,4 +303,8 @@
send_sig(current->exit_code, current, 1);
current->exit_code = 0;
}
+ if (!entryexit && (current->ptrace & PT_SYSCALL_SKIPCALL))
+ return 1;
+ else
+ return 0;
}
diff -Naur linux-2.6.29-vm/arch/um/kernel/skas/syscall.c linux-2.6.29-vm2/arch/um/kernel/skas/syscall.c
--- linux-2.6.29-vm/arch/um/kernel/skas/syscall.c 2009-03-24 22:04:09.000000000 +0100
+++ linux-2.6.29-vm2/arch/um/kernel/skas/syscall.c 2009-03-24 22:12:50.000000000 +0100
@@ -17,8 +17,9 @@
struct pt_regs *regs = container_of(r, struct pt_regs, regs);
long result;
int syscall;
+ int skip_call;
- syscall_trace(r, 0);
+ skip_call = syscall_trace(r, 0);
/*
* This should go in the declaration of syscall, but when I do that,
@@ -29,12 +30,15 @@
* gcc version 4.0.1 20050727 (Red Hat 4.0.1-5)
* in case it's a compiler bug.
*/
- syscall = UPT_SYSCALL_NR(r);
- if ((syscall >= NR_syscalls) || (syscall < 0))
- result = -ENOSYS;
- else result = EXECUTE_SYSCALL(syscall, regs);
+ if (skip_call == 0) {
+ syscall = UPT_SYSCALL_NR(r);
+ if ((syscall >= NR_syscalls) || (syscall < 0))
+ result = -ENOSYS;
+ else
+ result = EXECUTE_SYSCALL(syscall, regs);
- REGS_SET_SYSCALL_RETURN(r->gp, result);
+ REGS_SET_SYSCALL_RETURN(r->gp, result);
+ }
syscall_trace(r, 1);
}
diff -Naur linux-2.6.29-vm/arch/um/os-Linux/skas/process.c linux-2.6.29-vm2/arch/um/os-Linux/skas/process.c
--- linux-2.6.29-vm/arch/um/os-Linux/skas/process.c 2009-03-24 22:04:09.000000000 +0100
+++ linux-2.6.29-vm2/arch/um/os-Linux/skas/process.c 2009-03-24 22:24:07.000000000 +0100
@@ -153,11 +153,11 @@
}
/*
- * To use the same value of using_sysemu as the caller, ask it that value
- * (in local_using_sysemu
+ * To use the same value of using_sysptvm or using_sysemu as the caller, i
+ * ask it that value in use_sys_ptvm_or_emu
*/
static void handle_trap(int pid, struct uml_pt_regs *regs,
- int local_using_sysemu)
+ int use_sys_ptvm_or_emu)
{
int err, status;
@@ -167,7 +167,7 @@
/* Mark this as a syscall */
UPT_SYSCALL_NR(regs) = PT_SYSCALL_NR(regs->gp);
- if (!local_using_sysemu)
+ if (!use_sys_ptvm_or_emu)
{
err = ptrace(PTRACE_POKEUSR, pid, PT_SYSCALL_NR_OFFSET,
__NR_getpid);
@@ -354,6 +354,7 @@
int err, status, op, pid = userspace_pid[0];
/* To prevent races if using_sysemu changes under us.*/
int local_using_sysemu;
+ int local_using_sysptvm;
if (getitimer(ITIMER_VIRTUAL, &timer))
printk(UM_KERN_ERR "Failed to get itimer, errno = %d\n", errno);
@@ -375,11 +376,12 @@
/* Now we set local_using_sysemu to be used for one loop */
local_using_sysemu = get_using_sysemu();
+ local_using_sysptvm = get_using_sysptvm();
op = SELECT_PTRACE_OPERATION(local_using_sysemu,
singlestepping(NULL));
- if (ptrace(op, pid, 0, 0)) {
+ if (ptrace(op, pid, local_using_sysptvm, 0)) {
printk(UM_KERN_ERR "userspace - ptrace continue "
"failed, op = %d, errno = %d\n", op, errno);
fatal_sigsegv();
diff -Naur linux-2.6.29-vm/arch/um/os-Linux/start_up.c linux-2.6.29-vm2/arch/um/os-Linux/start_up.c
--- linux-2.6.29-vm/arch/um/os-Linux/start_up.c 2009-03-24 22:04:09.000000000 +0100
+++ linux-2.6.29-vm2/arch/um/os-Linux/start_up.c 2009-03-24 22:12:50.000000000 +0100
@@ -198,6 +198,34 @@
" See http://perso.wanadoo.fr/laurent.vivier/UML/ for further \n"
" information.\n\n");
+/* Changed only during early boot */
+static int force_sysptvm_disabled;
+
+static int __init nosysptvm_cmd_param(char *str, int* add)
+{
+ force_sysptvm_disabled = 1;
+ return 0;
+}
+
+__uml_setup("nosysptvm", nosysptvm_cmd_param,
+"nosysptvm\n"
+" Turns off syscall emulation tags for ptrace (ptrace_vm) on.\n"
+" Ptrace_vm is a feature introduced by Renzo Davoli. It changes\n"
+" behaviour of ptrace() and helps reducing host context switch rate.\n\n");
+
+static int use_sysemu;
+
+static int __init usesysemu_cmd_param(char *str, int* add)
+{
+ use_sysemu = 1;
+ return 0;
+}
+
+__uml_setup("usesysemu", usesysemu_cmd_param,
+"usesysemu\n"
+" Use sysemu instead of sysptvm even when the kernel supports it.\n\n"
+);
+
static void __init check_sysemu(void)
{
unsigned long regs[MAX_REG_NR];
@@ -293,6 +321,114 @@
non_fatal("missing\n");
}
+/*
+ * test thread code. This thread is started only to test
+ * which features are provided by the linux kernel
+ */
+static int sysptvm_child(void *arg)
+{
+ int *featurep = arg;
+ int p[2] = {-1, -1};
+ pid_t pid = os_getpid();
+ if (ptrace(PTRACE_TRACEME, 0, 0, 0) < 0) {
+ perror("ptrace test_ptracemulti");
+ kill(pid, SIGKILL);
+ }
+ kill(pid, SIGSTOP);
+ *featurep = 0;
+ os_getpid();
+ /*
+ * if it reaches this point in 1 stop it means that
+ * PTRACE_SYSCALL_SKIPEXIT works
+ */
+ *featurep = PTRACE_SYSCALL_SKIPEXIT;
+ pipe(p);
+ /*
+ * if after a PTRACE_SYSCALL_SKIPCALL p[0] is already <0
+ * pipe has been really skipped
+ */
+ if (p[0] < 0)
+ *featurep = PTRACE_SYSCALL_SKIPCALL;
+ else { /* clean up everything */
+ close(p[0]);
+ close(p[1]);
+ }
+ return 0;
+}
+
+/*
+ * kernel feature test:
+ * it returns:
+ * -1 error
+ * 0 old PTRACE_SYSCALL (addr is ignored)
+ * PTRACE_SYSCALL_SKIPEXIT: just skip_exit is provided
+ * PTRACE_SYSCALL_SKIPCALL: the entire syntax is implemented
+ * by the running kernel
+ */
+static int __init test_ptrace_sysptvm(void)
+{
+ int pid, status, rv, feature;
+ static char stack[1024];
+ feature = 0;
+
+ pid = clone(sysptvm_child, &stack[1020], SIGCHLD | CLONE_VM, &feature);
+ if (pid < 0)
+ return 0;
+ if (waitpid(pid, &status, WUNTRACED) < 0) {
+ kill(pid, SIGKILL);
+ return 0;
+ }
+ /* restart and wait for the next syscall (getpid)*/
+ rv = ptrace(PTRACE_SYSCALL, pid, 0, 0);
+ if (waitpid(pid, &status, WUNTRACED) < 0)
+ goto out;
+ /* try to skip the exit call */
+ rv = ptrace(PTRACE_SYSCALL, pid, PTRACE_SYSCALL_SKIPEXIT, 0);
+ if (rv < 0)
+ goto out;
+ /* wait for the next stop */
+ if (waitpid(pid, &status, WUNTRACED) < 0)
+ goto out;
+ /*
+ * if feature is already 0 it means that this is the exit call,
+ * and it has not been skipped, otherwise this is the
+ * entry call for the system call "time"
+ */
+ if (feature < PTRACE_SYSCALL_SKIPEXIT)
+ goto out;
+ /* restart (time) and and try to skip the entire call */
+ rv = ptrace(PTRACE_SYSCALL, pid, PTRACE_SYSCALL_SKIPCALL, 0);
+ if (waitpid(pid, &status, WUNTRACED) < 0)
+ return 0;
+out:
+ ptrace(PTRACE_KILL, pid, 0, 0);
+ /* eliminate zombie */
+ if (waitpid(pid, &status, WUNTRACED) < 0)
+ return 0;
+ return feature;
+}
+
+static int __init check_sysptvm(void)
+{
+ int feature = test_ptrace_sysptvm();
+
+ non_fatal("Checking ptrace new tags for syscall emulation...");
+ if (feature == PTRACE_SYSCALL_SKIPCALL) {
+ sysptvm_supported = 1;
+ non_fatal("OK");
+ if (!force_sysptvm_disabled) {
+ set_using_sysptvm(PTRACE_SYSCALL_SKIPCALL);
+ non_fatal("\n");
+ return 1;
+ } else {
+ non_fatal(" (disabled)\n");
+ return 0;
+ }
+ } else
+ non_fatal("unsupported\n");
+ return 0;
+}
+
static void __init check_ptrace(void)
{
int pid, syscall, n, status;
@@ -330,7 +466,8 @@
}
stop_ptraced_child(pid, 0, 1);
non_fatal("OK\n");
- check_sysemu();
+ if (use_sysemu || !check_sysptvm())
+ check_sysemu();
}
extern void check_tmpexec(void);
On Wed, Mar 25, 2009 at 12:20:19AM +0100, Renzo Davoli wrote:
>Patch rebased on 2.6.29. I have fixed the code following Cong's suggestion.
> renzo
>Although get_using_sysptvm is used as a boolean, I have left it int just
>for the sake of simmetry with get_using_sysemu.
>It could be safely changed to boolean at any time.
>
Thanks.
My point is *not* about changing it from 'int' to 'bool'; I mean you
should change the interface, for example by changing get_using_sysptvm()
to enable_sysptvm().