[v2 for style, CREDITS file is deprecated]
HPA is already on record calling for an execveat() which also does
fexecve()'s job: https://lkml.org/lkml/2006/7/11/556.
And the current glibc hack for fexecve() is already causing problems
in the wild. Eg: https://bugzilla.redhat.com/show_bug.cgi?id=241609,
https://lkml.org/lkml/2006/12/27/123, and as recounted at
http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=514043.
So here's an attempt at just that:
----------------------------------------------------------------------
This patch adds a new system call, execveat(2). execveat() is to
execve() as openat() is to open(): it takes a file descriptor that
refers to a directory, and resolves the filename relative to that.
In addition, if the filename is NULL, execveat() executes the file
to which the file descriptor refers. This replicates the functionality
of fexecve(), which is a system call in other UNIXen, but in Linux
glibc v2.16 it's a gross hack that depends on /proc being mounted.
That hack does not work in chrooted sandboxes, or stripped-down
systems without /proc mounted. execveat() does.
Only x86-64 and i386 ABIs are supported in this patch. User-Mode Linux
is also supported.
Signed-off-by: Meredydd Luff <[email protected]>
---
arch/alpha/kernel/binfmt_loader.c | 2 +-
arch/um/kernel/exec.c | 36 ++++++++++++++++----
arch/x86/ia32/ia32entry.S | 1 +
arch/x86/ia32/sys_ia32.c | 20 +++++++++++
arch/x86/include/asm/sys_ia32.h | 3 ++
arch/x86/kernel/entry_32.S | 17 ++++++++++
arch/x86/kernel/entry_64.S | 15 ++++++++
arch/x86/kernel/process.c | 27 +++++++++++++++
arch/x86/syscalls/syscall_32.tbl | 1 +
arch/x86/syscalls/syscall_64.tbl | 1 +
arch/x86/um/sys_call_table_32.c | 1 +
arch/x86/um/sys_call_table_64.c | 1 +
fs/binfmt_elf.c | 2 +-
fs/binfmt_elf_fdpic.c | 2 +-
fs/binfmt_em86.c | 2 +-
fs/binfmt_flat.c | 2 +-
fs/binfmt_misc.c | 2 +-
fs/binfmt_script.c | 2 +-
fs/exec.c | 65 ++++++++++++++++++++++++++++++------
include/asm-generic/syscalls.h | 7 ++++
include/linux/compat.h | 2 +
include/linux/fs.h | 2 +-
include/linux/sched.h | 3 ++
23 files changed, 190 insertions(+), 26 deletions(-)
diff --git a/arch/alpha/kernel/binfmt_loader.c b/arch/alpha/kernel/binfmt_loader.c
index d1f474d..7968491 100644
--- a/arch/alpha/kernel/binfmt_loader.c
+++ b/arch/alpha/kernel/binfmt_loader.c
@@ -24,7 +24,7 @@ static int load_binary(struct linux_binprm *bprm, struct pt_regs *regs)
loader = bprm->vma->vm_end - sizeof(void *);
- file = open_exec("/sbin/loader");
+ file = open_exec(AT_FDCWD, "/sbin/loader");
retval = PTR_ERR(file);
if (IS_ERR(file))
return retval;
diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c
index 6cade93..cd7b2a9 100644
--- a/arch/um/kernel/exec.c
+++ b/arch/um/kernel/exec.c
@@ -5,6 +5,7 @@
#include <linux/stddef.h>
#include <linux/module.h>
+#include <linux/fcntl.h>
#include <linux/fs.h>
#include <linux/ptrace.h>
#include <linux/sched.h>
@@ -44,13 +45,13 @@ void start_thread(struct pt_regs *regs, unsigned long eip, unsigned long esp)
}
EXPORT_SYMBOL(start_thread);
-static long execve1(const char *file,
- const char __user *const __user *argv,
- const char __user *const __user *env)
+static long execveat1(int dfd, const char *file,
+ const char __user *const __user *argv,
+ const char __user *const __user *env)
{
long error;
- error = do_execve(file, argv, env, ¤t->thread.regs);
+ error = do_execveat(dfd, file, argv, env, ¤t->thread.regs);
if (error == 0) {
task_lock(current);
current->ptrace &= ~PT_DTRACE;
@@ -66,7 +67,7 @@ long um_execve(const char *file, const char __user *const __user *argv, const ch
{
long err;
- err = execve1(file, argv, env);
+ err = execveat1(AT_FDCWD, file, argv, env);
if (!err)
UML_LONGJMP(current->thread.exec_buf, 1);
return err;
@@ -80,9 +81,30 @@ long sys_execve(const char __user *file, const char __user *const __user *argv,
filename = getname(file);
error = PTR_ERR(filename);
- if (IS_ERR(filename)) goto out;
- error = execve1(filename, argv, env);
+ if (IS_ERR(filename))
+ goto out;
+ error = execveat1(AT_FDCWD, filename, argv, env);
putname(filename);
out:
return error;
}
+
+long sys_execveat(int dfd, const char __user *file,
+ const char __user *const __user *argv,
+ const char __user *const __user *env)
+{
+ long error;
+ char *filename = NULL;
+
+ if (file) {
+ filename = getname(file);
+ error = PTR_ERR(filename);
+ if (IS_ERR(filename))
+ goto out;
+ }
+ error = execveat1(dfd, filename, argv, env);
+ if (filename)
+ putname(filename);
+ out:
+ return error;
+}
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 20e5f7b..fd71cd7 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -460,6 +460,7 @@ GLOBAL(\label)
PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
PTREGSCALL stub32_execve, sys32_execve, %rcx
+ PTREGSCALL stub32_execveat, sys32_execveat, %r8
PTREGSCALL stub32_fork, sys_fork, %rdi
PTREGSCALL stub32_clone, sys32_clone, %rdx
PTREGSCALL stub32_vfork, sys_vfork, %rdi
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 4540bec..e88398a 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -400,6 +400,26 @@ asmlinkage long sys32_execve(const char __user *name, compat_uptr_t __user *argv
return error;
}
+asmlinkage long sys32_execveat(int dfd, const char __user *name,
+ compat_uptr_t __user *argv,
+ compat_uptr_t __user *envp,
+ struct pt_regs *regs)
+{
+ long error;
+ char *filename = NULL;
+
+ if (name) {
+ filename = getname(name);
+ error = PTR_ERR(filename);
+ if (IS_ERR(filename))
+ return error;
+ }
+ error = compat_do_execveat(dfd, filename, argv, envp, regs);
+ if (filename)
+ putname(filename);
+ return error;
+}
+
asmlinkage long sys32_clone(unsigned int clone_flags, unsigned int newsp,
struct pt_regs *regs)
{
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index 3fda9db4..59629cf 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -56,6 +56,9 @@ asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32);
asmlinkage long sys32_execve(const char __user *, compat_uptr_t __user *,
compat_uptr_t __user *, struct pt_regs *);
+asmlinkage long sys32_execveat(int dfd, const char __user *,
+ compat_uptr_t __user *, compat_uptr_t __user *,
+ struct pt_regs *);
asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *);
long sys32_lseek(unsigned int, int, unsigned int);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 623f288..771f19a 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -724,10 +724,27 @@ ENTRY(ptregs_##name) ; \
CFI_ENDPROC; \
ENDPROC(ptregs_##name)
+#define PTREGSCALL4(name) \
+ENTRY(ptregs_##name) ; \
+ CFI_STARTPROC; \
+ leal 4(%esp),%eax; \
+ pushl_cfi %eax; \
+ pushl_cfi PT_ESI(%eax); \
+ movl PT_EDX(%eax),%ecx; \
+ movl PT_ECX(%eax),%edx; \
+ movl PT_EBX(%eax),%eax; \
+ call sys_##name; \
+ addl $8,%esp; \
+ CFI_ADJUST_CFA_OFFSET -8; \
+ ret; \
+ CFI_ENDPROC; \
+ENDPROC(ptregs_##name)
+
PTREGSCALL1(iopl)
PTREGSCALL0(fork)
PTREGSCALL0(vfork)
PTREGSCALL3(execve)
+PTREGSCALL4(execveat)
PTREGSCALL2(sigaltstack)
PTREGSCALL0(sigreturn)
PTREGSCALL0(rt_sigreturn)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 7d65133..597c356 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -766,6 +766,21 @@ ENTRY(stub_execve)
CFI_ENDPROC
END(stub_execve)
+ENTRY(stub_execveat)
+ CFI_STARTPROC
+ addq $8, %rsp
+ PARTIAL_FRAME 0
+ SAVE_REST
+ FIXUP_TOP_OF_STACK %r11
+ movq %rsp, %r8
+ call sys_execveat
+ RESTORE_TOP_OF_STACK %r11
+ movq %rax,RAX(%rsp)
+ RESTORE_REST
+ jmp int_ret_from_sys_call
+ CFI_ENDPROC
+END(stub_execveat)
+
/*
* sigreturn is special because it needs to restore all registers on return.
* This cannot be done with SYSRET, so use the IRET return path instead.
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 735279e..5fa3c0a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -363,6 +363,33 @@ long sys_execve(const char __user *name,
return error;
}
+long sys_execveat(int dfd, const char __user *name,
+ const char __user *const __user *argv,
+ const char __user *const __user *envp, struct pt_regs *regs)
+{
+ long error;
+ char *filename = NULL;
+
+ if (name) {
+ filename = getname(name);
+ error = PTR_ERR(filename);
+ if (IS_ERR(filename))
+ return error;
+ }
+ error = do_execveat(dfd, filename, argv, envp, regs);
+
+#ifdef CONFIG_X86_32
+ if (error == 0) {
+ /* Make sure we don't return using sysenter.. */
+ set_thread_flag(TIF_IRET);
+ }
+#endif
+
+ if (filename)
+ putname(filename);
+ return error;
+}
+
/*
* Idle related variables and functions
*/
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 7a35a6e..a6a2788 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -356,3 +356,4 @@
347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv
348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev
349 i386 kcmp sys_kcmp
+350 i386 execveat ptregs_execveat stub32_execveat
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 51171ae..c35df9e 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -319,6 +319,7 @@
310 64 process_vm_readv sys_process_vm_readv
311 64 process_vm_writev sys_process_vm_writev
312 64 kcmp sys_kcmp
+313 64 execveat stub_execveat
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
index 68d1dc9..05ec219 100644
--- a/arch/x86/um/sys_call_table_32.c
+++ b/arch/x86/um/sys_call_table_32.c
@@ -26,6 +26,7 @@
#define ptregs_fork sys_fork
#define ptregs_execve sys_execve
+#define ptregs_execveat sys_execveat
#define ptregs_iopl sys_iopl
#define ptregs_vm86old sys_vm86old
#define ptregs_clone sys_clone
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index 170bd92..68dbda7 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -31,6 +31,7 @@
#define stub_fork sys_fork
#define stub_vfork sys_vfork
#define stub_execve sys_execve
+#define stub_execveat sys_execveat
#define stub_sigaltstack sys_sigaltstack
#define stub_rt_sigreturn sys_rt_sigreturn
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 1b52956..f165c1d 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -653,7 +653,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
goto out_free_interp;
- interpreter = open_exec(elf_interpreter);
+ interpreter = open_exec(AT_FDCWD, elf_interpreter);
retval = PTR_ERR(interpreter);
if (IS_ERR(interpreter))
goto out_free_interp;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 3d77cf8..6923adf 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -235,7 +235,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
kdebug("Using ELF interpreter %s", interpreter_name);
/* replace the program with the interpreter */
- interpreter = open_exec(interpreter_name);
+ interpreter = open_exec(AT_FDCWD, interpreter_name);
retval = PTR_ERR(interpreter);
if (IS_ERR(interpreter)) {
interpreter = NULL;
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 2790c7e..7a1f312 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -80,7 +80,7 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs)
* Note that we use open_exec() as the name is now in kernel
* space, and we don't need to copy it.
*/
- file = open_exec(interp);
+ file = open_exec(AT_FDCWD, interp);
if (IS_ERR(file))
return PTR_ERR(file);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 178cb70..f53b9e2 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -820,7 +820,7 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
/* Open the file up */
bprm.filename = buf;
- bprm.file = open_exec(bprm.filename);
+ bprm.file = open_exec(AT_FDCWD, bprm.filename);
res = PTR_ERR(bprm.file);
if (IS_ERR(bprm.file))
return res;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 790b3cd..330e9ec 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -178,7 +178,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
bprm->interp = iname; /* for binfmt_script */
- interp_file = open_exec (iname);
+ interp_file = open_exec(AT_FDCWD, iname);
retval = PTR_ERR (interp_file);
if (IS_ERR (interp_file))
goto _error;
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index d3b8c1f..09a056b 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -87,7 +87,7 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
/*
* OK, now restart the process with the interpreter's dentry.
*/
- file = open_exec(interp);
+ file = open_exec(AT_FDCWD, interp);
if (IS_ERR(file))
return PTR_ERR(file);
diff --git a/fs/exec.c b/fs/exec.c
index da27b91..48a3ba7 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -758,7 +758,7 @@ EXPORT_SYMBOL(setup_arg_pages);
#endif /* CONFIG_MMU */
-struct file *open_exec(const char *name)
+struct file *open_exec(int dfd, const char *name)
{
struct file *file;
int err;
@@ -768,9 +768,22 @@ struct file *open_exec(const char *name)
.intent = LOOKUP_OPEN
};
- file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW);
- if (IS_ERR(file))
- goto out;
+ if (name) {
+ file = do_filp_open(dfd, name, &open_exec_flags,
+ LOOKUP_FOLLOW);
+ if (IS_ERR(file))
+ goto out;
+ } else {
+ file = fget(dfd);
+ err = -EBADF;
+ if (!file)
+ goto exit_nofile;
+
+ err = inode_permission(file->f_path.dentry->d_inode,
+ open_exec_flags.acc_mode);
+ if (err)
+ goto exit;
+ }
err = -EACCES;
if (!S_ISREG(file->f_path.dentry->d_inode->i_mode))
@@ -779,7 +792,8 @@ struct file *open_exec(const char *name)
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
goto exit;
- fsnotify_open(file);
+ if (name)
+ fsnotify_open(file);
err = deny_write_access(file);
if (err)
@@ -790,6 +804,7 @@ out:
exit:
fput(file);
+exit_nofile:
return ERR_PTR(err);
}
EXPORT_SYMBOL(open_exec);
@@ -1463,7 +1478,8 @@ EXPORT_SYMBOL(search_binary_handler);
/*
* sys_execve() executes a new program.
*/
-static int do_execve_common(const char *filename,
+static int do_execveat_common(int dfd,
+ const char *filename,
struct user_arg_ptr argv,
struct user_arg_ptr envp,
struct pt_regs *regs)
@@ -1510,7 +1526,7 @@ static int do_execve_common(const char *filename,
clear_in_exec = retval;
current->in_execve = 1;
- file = open_exec(filename);
+ file = open_exec(dfd, filename);
retval = PTR_ERR(file);
if (IS_ERR(file))
goto out_unmark;
@@ -1518,8 +1534,9 @@ static int do_execve_common(const char *filename,
sched_exec();
bprm->file = file;
- bprm->filename = filename;
- bprm->interp = filename;
+ bprm->filename = filename ?:
+ (const char *) file->f_path.dentry->d_name.name;
+ bprm->interp = bprm->filename;
retval = bprm_mm_init(bprm);
if (retval)
@@ -1597,7 +1614,17 @@ int do_execve(const char *filename,
{
struct user_arg_ptr argv = { .ptr.native = __argv };
struct user_arg_ptr envp = { .ptr.native = __envp };
- return do_execve_common(filename, argv, envp, regs);
+ return do_execveat_common(AT_FDCWD, filename, argv, envp, regs);
+}
+
+int do_execveat(int dfd, const char *filename,
+ const char __user *const __user *__argv,
+ const char __user *const __user *__envp,
+ struct pt_regs *regs)
+{
+ struct user_arg_ptr argv = { .ptr.native = __argv };
+ struct user_arg_ptr envp = { .ptr.native = __envp };
+ return do_execveat_common(dfd, filename, argv, envp, regs);
}
#ifdef CONFIG_COMPAT
@@ -1614,7 +1641,23 @@ int compat_do_execve(char *filename,
.is_compat = true,
.ptr.compat = __envp,
};
- return do_execve_common(filename, argv, envp, regs);
+ return do_execveat_common(AT_FDCWD, filename, argv, envp, regs);
+}
+
+int compat_do_execveat(int dfd, char *filename,
+ compat_uptr_t __user *__argv,
+ compat_uptr_t __user *__envp,
+ struct pt_regs *regs)
+{
+ struct user_arg_ptr argv = {
+ .is_compat = true,
+ .ptr.compat = __argv,
+ };
+ struct user_arg_ptr envp = {
+ .is_compat = true,
+ .ptr.compat = __envp,
+ };
+ return do_execveat_common(dfd, filename, argv, envp, regs);
}
#endif
diff --git a/include/asm-generic/syscalls.h b/include/asm-generic/syscalls.h
index d89dec8..ded3be3 100644
--- a/include/asm-generic/syscalls.h
+++ b/include/asm-generic/syscalls.h
@@ -29,6 +29,13 @@ asmlinkage long sys_execve(const char __user *filename,
struct pt_regs *regs);
#endif
+#ifndef sys_execveat
+asmlinkage long sys_execveat(int dfd, const char __user *filename,
+ const char __user *const __user *argv,
+ const char __user *const __user *envp,
+ struct pt_regs *regs);
+#endif
+
#ifndef sys_mmap2
asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 4e89039..25ab799 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -291,6 +291,8 @@ asmlinkage ssize_t compat_sys_pwritev(unsigned long fd,
int compat_do_execve(char *filename, compat_uptr_t __user *argv,
compat_uptr_t __user *envp, struct pt_regs *regs);
+int compat_do_execveat(int dfd, char *filename, compat_uptr_t __user *argv,
+ compat_uptr_t __user *envp, struct pt_regs *regs);
asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
compat_ulong_t __user *outp, compat_ulong_t __user *exp,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 17fd887..6c40dc7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2316,7 +2316,7 @@ extern struct file *create_write_pipe(int flags);
extern void free_write_pipe(struct file *);
extern int kernel_read(struct file *, loff_t, char *, unsigned long);
-extern struct file * open_exec(const char *);
+extern struct file *open_exec(int dfd, const char *);
/* fs/dcache.c -- generic fs support functions */
extern int is_subdir(struct dentry *, struct dentry *);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4a1f493..2720f83 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2310,6 +2310,9 @@ extern int disallow_signal(int);
extern int do_execve(const char *,
const char __user * const __user *,
const char __user * const __user *, struct pt_regs *);
+extern int do_execveat(int dfd, const char *,
+ const char __user * const __user *,
+ const char __user * const __user *, struct pt_regs *);
extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
struct task_struct *fork_idle(int);
--
1.7.4.1
On 08/01/2012 03:10 PM, Meredydd Luff wrote:
> diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
> index 51171ae..c35df9e 100644
> --- a/arch/x86/syscalls/syscall_64.tbl
> +++ b/arch/x86/syscalls/syscall_64.tbl
> @@ -319,6 +319,7 @@
> 310 64 process_vm_readv sys_process_vm_readv
> 311 64 process_vm_writev sys_process_vm_writev
> 312 64 kcmp sys_kcmp
> +313 64 execveat stub_execveat
>
> #
> # x32-specific system call numbers start at 512 to avoid cache impact
I think that should be common, not 64 (as should kcmp be).
-hpa
On Wed, Aug 1, 2012 at 11:53 PM, H. Peter Anvin <[email protected]> wrote:
> On 08/01/2012 03:10 PM, Meredydd Luff wrote:
>> diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
>> index 51171ae..c35df9e 100644
>> --- a/arch/x86/syscalls/syscall_64.tbl
>> +++ b/arch/x86/syscalls/syscall_64.tbl
>> @@ -319,6 +319,7 @@
>> 310 64 process_vm_readv sys_process_vm_readv
>> 311 64 process_vm_writev sys_process_vm_writev
>> 312 64 kcmp sys_kcmp
>> +313 64 execveat stub_execveat
>>
>> #
>> # x32-specific system call numbers start at 512 to avoid cache impact
>
> I think that should be common, not 64 (as should kcmp be).
I copied the original execve, which is 64.
Meredydd
On 08/01/2012 04:09 PM, Meredydd Luff wrote:
>>> #
>>> # x32-specific system call numbers start at 512 to avoid cache impact
>>
>> I think that should be common, not 64 (as should kcmp be).
>
> I copied the original execve, which is 64.
>
Sorry, you're right. The argument vector needs compatibility support.
This means you need an x32 version of the function -- execve
unfortunately is one of the few system calls which require a special x32
version (although it's a simple wrapper around sys32_execve). See
sys_x32_execve.
-hpa
On 08/01/2012 04:30 PM, H. Peter Anvin wrote:
> On 08/01/2012 04:09 PM, Meredydd Luff wrote:
>>>> #
>>>> # x32-specific system call numbers start at 512 to avoid cache impact
>>>
>>> I think that should be common, not 64 (as should kcmp be).
>>
>> I copied the original execve, which is 64.
>>
>
> Sorry, you're right. The argument vector needs compatibility support.
>
> This means you need an x32 version of the function -- execve
> unfortunately is one of the few system calls which require a special x32
> version (although it's a simple wrapper around sys32_execve). See
> sys_x32_execve.
>
Err, stub_x32_execve.
-hpa
Sorry misunderstood your offlist mail I can cook up an x32 binary easily enough.
Meredydd Luff <[email protected]> wrote:
>On 2 Aug 2012 00:33, "H. Peter Anvin" <[email protected]> wrote:
>> > Sorry, you're right. The argument vector needs compatibility
>support.
>> >
>> > This means you need an x32 version of the function
>
>This is why I originally asked you (off-list) whether it was okay to
>omit
>x32 support in the initial patch :)
>
>I have x32 code already, but I don't have the toolchain or glibc to
>test
>it. (Nor the pain tolerance to build the bleeding-edge versions on this
>laptop - unless I really, really need to...)
>
>If you (or someone else with a working x32 toolchain) could build a
>static
>binary from a tiny test program, I could test that code and put it back
>in
>the patch.
>
>Meredydd
--
Sent from my mobile phone. Please excuse brevity and lack of formatting.
On Wed, Aug 01, 2012 at 04:30:22PM -0700, H. Peter Anvin wrote:
> On 08/01/2012 04:09 PM, Meredydd Luff wrote:
> >>> #
> >>> # x32-specific system call numbers start at 512 to avoid cache impact
> >>
> >> I think that should be common, not 64 (as should kcmp be).
> >
> > I copied the original execve, which is 64.
> >
>
> Sorry, you're right. The argument vector needs compatibility support.
>
> This means you need an x32 version of the function -- execve
> unfortunately is one of the few system calls which require a special x32
> version (although it's a simple wrapper around sys32_execve). See
> sys_x32_execve.
I *really* strongly object to doing that thing before we sanitize the
situation with sys_execve(). As it is, the damn thing is defined
separately on each architecture, with spectaculary ugly kludges used
in these implementations. Adding a parallel pile of kludges (and
due to their nature, they'll need to be changed in non-trivial
way in a lot of cases) is simply wrong.
The thing is, there's essentially no reason to have more than one
implementation. What they are (badly) doing is "we need to find
pt_regs to pass to do_execve(), the thing we are after has to be near
our stack frame, so let's try to get to it that way". With really
ugly set of kludges trying to do just that.
What we should use instead is task_pt_regs(); maybe introduce
current_pt_regs(), defaulting to task_pt_regs(current) and letting
architectures that can do it better (on some it's simply
available in dedicated register, on some it's better to work
from current_thread_info(), etc.) override the default.
With that we have a fairly good chance to merge most of those
guys; probably not all of them, due to e.g. mips weirdness,
but enough to make it worth doing.
The obstacle is in lazy kernel_execve() implementations;
ones that simply issue a trap/whatever is used to enter
the system call. Directly from kernel space. It doesn't
have to be done that way; see what e.g. arm does there.
Note that doing it without syscall instruction avoids another
headache; namely, we don't have to worry about returning
from *failed* execve (i.e. return to kernel mode) through
the codepath that is normally taken only when returning
to userland.
FWIW, I would try to pull the asm tail of arm kernel_execve()
into something that would look to C side as
ret_from_kernel_exec(®s); /* never returns */
and start converting architectures to that primitive. It should
copy the provided pt_regs to normal location (keeping in mind
that there really might be an overlap), set registers (including
stack pointer) for normal return to user path and jump there.
Essentially, that's the real arch-dependent part of kernel_execve() -
transition from kernel thread to userland process.
It can be done architecture-by-architecture; there's no need to make
it a flagday conversion. Once an arch is handled, we define
something like __ARCH_HAS_RET_FROM_KERNEL_EXEC and get the common
implementations of kernel_execve() and sys_execve() for that -
those could simply live in fs/exec.c under the matching ifdef.
Along with your sys_execveat(). I can probably throw alpha,
arm and x86 conversions into the pile, but it really needs to
be handled on linux-arch, with arch maintainers at least agreeing
in principle with that scheme.
On Thu, Aug 2, 2012 at 7:55 AM, Al Viro <[email protected]> wrote:
>> This means you need an x32 version of the function -- execve
>> unfortunately is one of the few system calls which require a special x32
>> version (although it's a simple wrapper around sys32_execve). See
>> sys_x32_execve.
>
> I *really* strongly object to doing that thing before we sanitize the
> situation with sys_execve().
"That thing" = "creating an x32 entry stub", or "merging execveat() at all"?
(snip)
> The thing is, there's essentially no reason to have more than one
> implementation. What they are (badly) doing is "we need to find
> pt_regs to pass to do_execve(), the thing we are after has to be near
> our stack frame, so let's try to get to it that way".
Hang on...it's not just sys_execve that fits that description, is it?
You seem to be describing every call that needs a pt_regs parameter,
which at a glance is anything with a stub_ or PTREGSCALL in
arch/x86/kernel/entry_{32,64}.S. That's: clone, fork, vfork,
sigaltstack, iopl, execve, sigreturn, rt_sigreturn, vm86, vm86old.
Most of those are handled by a common PTREGSCALL macro, but there are
a few that get special treatment (different set on each arch - on
x86-64 it's execve and rt_sigreturn ; on i386 it's just clone).
Is there's something special about execve in particular, or do you
want to overhaul all the ptregscalls?
Meredydd
On Thu, Aug 02, 2012 at 10:14:53AM +0100, Meredydd Luff wrote:
> On Thu, Aug 2, 2012 at 7:55 AM, Al Viro <[email protected]> wrote:
> >> This means you need an x32 version of the function -- execve
> >> unfortunately is one of the few system calls which require a special x32
> >> version (although it's a simple wrapper around sys32_execve). See
> >> sys_x32_execve.
> >
> > I *really* strongly object to doing that thing before we sanitize the
> > situation with sys_execve().
>
> "That thing" = "creating an x32 entry stub", or "merging execveat() at all"?
>
> (snip)
> > The thing is, there's essentially no reason to have more than one
> > implementation. What they are (badly) doing is "we need to find
> > pt_regs to pass to do_execve(), the thing we are after has to be near
> > our stack frame, so let's try to get to it that way".
>
> Hang on...it's not just sys_execve that fits that description, is it?
> You seem to be describing every call that needs a pt_regs parameter,
> which at a glance is anything with a stub_ or PTREGSCALL in
> arch/x86/kernel/entry_{32,64}.S. That's: clone, fork, vfork,
> sigaltstack, iopl, execve, sigreturn, rt_sigreturn, vm86, vm86old.
> Most of those are handled by a common PTREGSCALL macro, but there are
> a few that get special treatment (different set on each arch - on
> x86-64 it's execve and rt_sigreturn ; on i386 it's just clone).
>
> Is there's something special about execve in particular, or do you
> want to overhaul all the ptregscalls?
You are looking at that from the wrong side; it's not that this
set of syscalls on x86 has the same kind of wrapper - it's that
on different architectures the kludges are seriously different
and fairly brittle. sigaltstack(), BTW, doesn't need to be
arch-specific at all - if you check what its pt_regs argument
is used for, we just need something like current_user_stack_pointer()
and that's it. It's a different patch series, anyway.
There's a difference between "here's the syscall implementation,
here's its hookup for x86, let other architectures update their
syscall tables" and "here's the implementation in arch/x86 and its
hookup for x86; let other architectures port it in whatever way
they need". The latter is a recipe for breakage - we'd been
through that kind of story quite a few times.
Out of that set, vm86/vm86old/iopl are genuine x86-only syscalls.
sigreturn and rt_sigreturn are weird, subtle and, sadly,
unmergable - syscall restart prevention logics is different
enough to make it not feasible at the moment. It's a serious
source of PITA, BTW - quite a few of those had the same bugs,
years after they'd been discovered and fixed on a subset of
architectures. Been there, fixed some of those, the latest
batch this April... fork/vfork/clone is an interesting
question - IIRC, there are seriously subtle issues of some
sort on sparc, but I don't remember details right now.
Really, go and grep for do_execve; most of the callers will
be in execve(2) implementations. Look at them carefully -
while some get pt_regs from some wrapper, there's a bunch
that does that manually in C. "Ugly" doesn't even begin
to describe what's being done...
FWIW, I've just pushed (completely untested) arm and alpha
parts of what I described into signal.git#execve2; x86 is
next. Note that after that sys_execve() is identical on
converted architectures and can be merged; ditto for
kernel_execve(). After I do x86 counterpart, I'll
take those guys to fs/exec.c under ifdef for new __ARCH_HAS_...
(and define it on already converted ones, obviously).
Then your patch goes there, except that implementation
gets put into fs/exec.c, under the same ifdef. And with
current_pt_regs() used instead of the extra argument,
of course. From that point on it can be used on any converted
architecture. And conversion consists of
* providing ret_from_kernel_execve() that would
put pt_regs where they belong and return to userland.
* providing current_pt_regs(), with default being
just task_pt_regs(current).
* defining that new __ARCH_HAS_...
* removing sys_execve()/kernel_execve() implementations;
the ones in fs/exec.c will work just fine.
Can be done at leisure, architecture by architecture. It's
obviously the next cycle fodder - we have only a couple of
days left of this merge window, anyway.
I really think that this pair of primitives is the right
way to deal with execve mess.
On Thu, Aug 2, 2012 at 11:30 AM, Al Viro <[email protected]> wrote:
> FWIW, I've just pushed (completely untested) arm and alpha
> parts of what I described into signal.git#execve2; x86 is
> next. Note that after that sys_execve() is identical on
> converted architectures and can be merged; ditto for
> kernel_execve(). After I do x86 counterpart, I'll
> take those guys to fs/exec.c under ifdef for new __ARCH_HAS_...
> (and define it on already converted ones, obviously).
> Then your patch goes there, except that implementation
> gets put into fs/exec.c, under the same ifdef. And with
> current_pt_regs() used instead of the extra argument,
> of course. From that point on it can be used on any converted
> architecture.
OK, that makes sense to me.
What would you need from me, and when? Should I just wait for your
#ifdef-ed sys_execve() in fs/exec.c, and re-spin the patch based on
that?
Is there anything else I can/should do in the meantime to make this
patch acceptable?
Meredydd
On 08/01/2012 03:10 PM, Meredydd Luff wrote:
>
> +#ifndef sys_execveat
> +asmlinkage long sys_execveat(int dfd, const char __user *filename,
> + const char __user *const __user *argv,
> + const char __user *const __user *envp,
> + struct pt_regs *regs);
> +#endif
Should this have a flags argument for future extension?
--Andy
On 08/09/2012 12:19 PM, Andy Lutomirski wrote:
> On 08/01/2012 03:10 PM, Meredydd Luff wrote:
>>
>> +#ifndef sys_execveat
>> +asmlinkage long sys_execveat(int dfd, const char __user *filename,
>> + const char __user *const __user *argv,
>> + const char __user *const __user *envp,
>> + struct pt_regs *regs);
>> +#endif
>
> Should this have a flags argument for future extension?
>
Not just for future extension. We already have a NOFOLLOW flag for the
-at functions, and that would be applicable here. CLOEXEC, obviously not ;)
-hpa