Hello,
This patchset tries to clean up the init/initramfs code, especially the syscall
invocations, which produce many warnings from sparse because of the address
space change. One possible solution would be to eliminate such calls altogether
and use internal kernel functions directly. But Al Viro mentioned there's
a historical(?) reason not to do so. [1]
The first two patches of this set wrap all syscall invocations with kern_sys_*()
helper functions which do the nasty address space conversions for you. This
idea was suggested by Arnd Bergmann. The last one tries to implement the idea above
- calling internal functions directly - behind a kernel config option,
even though I'm not sure this is the right thing. :-(
This patchset depends on my previous patch "init: mark __user address space
on string literals" [2] now contained in -mm tree.
Any comments would be welcomed.
Thanks.
[1] http://lkml.org/lkml/2010/8/20/202
[2] http://lkml.org/lkml/2010/8/18/157
---
Namhyung Kim (3):
init: add sys-wrapper.h
initramfs: use kern_sys_* macros instead of syscall
init: introduce CONFIG_USE_INIT_SYSCALL_AS_KERNEL_ROUTINE
init/Makefile | 2 +
init/sys-wrapper.c | 589 ++++++++++++++++++++++++++++++++++++++++++++++++++++
init/sys-wrapper.h | 305 +++++++++++++++++++++++++++
usr/Kconfig | 7 +
4 files changed, 903 insertions(+), 0 deletions(-)
create mode 100644 init/sys-wrapper.c
create mode 100644 init/sys-wrapper.h
--
1.7.2.2
sys-wrapper.h contains wrapper functions for various syscalls used in init
code. These wrappers handle the proper address space conversion so that a lot
of warnings from sparse can be removed.
Signed-off-by: Namhyung Kim <[email protected]>
---
init/sys-wrapper.h | 246 ++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 246 insertions(+), 0 deletions(-)
create mode 100644 init/sys-wrapper.h
diff --git a/init/sys-wrapper.h b/init/sys-wrapper.h
new file mode 100644
index 0000000..e4227f9
--- /dev/null
+++ b/init/sys-wrapper.h
@@ -0,0 +1,246 @@
+/*
+ * init/sys-wrapper.h
+ *
+ * Copyright (C) 2010 Namhyung Kim <[email protected]>
+ *
+ * wrappers for various syscalls for use in the init code
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/dirent.h>
+#include <linux/syscalls.h>
+
+
+/* These macros are called just before/after the actual syscalls. */
+#define KSYS_PREPARE \
+ mm_segment_t old_fs = get_fs(); \
+ set_fs(KERNEL_DS);
+
+#define KSYS_RESTORE \
+ set_fs(old_fs);
+
+
+static inline int kern_sys_link(const char *oldname, const char *newname)
+{
+ int ret;
+ KSYS_PREPARE;
+
+ ret = sys_link((const char __user __force *) oldname,
+ (const char __user __force *) newname);
+ KSYS_RESTORE;
+ return ret;
+}
+
+static inline int kern_sys_unlink(const char *pathname)
+{
+ int ret;
+ KSYS_PREPARE;
+
+ ret = sys_unlink((const char __user __force *) pathname);
+
+ KSYS_RESTORE;
+ return ret;
+}
+
+static inline int kern_sys_newlstat(const char *filename,
+ struct stat *statbuf)
+{
+ int ret;
+ KSYS_PREPARE;
+
+ ret = sys_newlstat((const char __user __force *) filename,
+ (struct stat __user __force *) statbuf);
+ KSYS_RESTORE;
+ return ret;
+}
+
+static inline int kern_sys_mkdir(const char *pathname, int mode)
+{
+ int ret;
+ KSYS_PREPARE;
+
+ ret = sys_mkdir((const char __user __force *) pathname, mode);
+
+ KSYS_RESTORE;
+ return ret;
+}
+
+static inline int kern_sys_rmdir(const char *pathname)
+{
+ int ret;
+ KSYS_PREPARE;
+
+ ret = sys_rmdir((const char __user __force *) pathname);
+
+ KSYS_RESTORE;
+ return ret;
+}
+
+static inline int kern_sys_mknod(const char *filename, int mode, unsigned dev)
+{
+ int ret;
+ KSYS_PREPARE;
+
+ ret = sys_mknod((const char __user __force *) filename, mode, dev);
+
+ KSYS_RESTORE;
+ return ret;
+}
+
+static inline int kern_sys_chown(const char *filename, uid_t user, gid_t group)
+{
+ int ret;
+ KSYS_PREPARE;
+
+ ret = sys_chown((const char __user __force *) filename, user, group);
+
+ KSYS_RESTORE;
+ return ret;
+}
+
+static inline int kern_sys_chmod(const char *filename, mode_t mode)
+{
+ int ret;
+ KSYS_PREPARE;
+
+ ret = sys_chmod((const char __user __force *) filename, mode);
+
+ KSYS_RESTORE;
+ return ret;
+}
+
+static inline int kern_sys_open(const char *filename, int flags, int mode)
+{
+ int ret;
+ KSYS_PREPARE;
+
+ ret = sys_open((const char __user __force *) filename, flags, mode);
+
+ KSYS_RESTORE;
+ return ret;
+}
+
+static inline int kern_sys_fchown(unsigned int fd, uid_t user, gid_t group)
+{
+ int ret;
+ KSYS_PREPARE;
+
+ ret = sys_fchown(fd, user, group);
+
+ KSYS_RESTORE;
+ return ret;
+}
+
+static inline int kern_sys_fchmod(unsigned int fd, mode_t mode)
+{
+ int ret;
+ KSYS_PREPARE;
+
+ ret = sys_fchmod(fd, mode);
+
+ KSYS_RESTORE;
+ return ret;
+}
+
+static inline int kern_sys_ftruncate(unsigned int fd, unsigned long length)
+{
+ int ret;
+ KSYS_PREPARE;
+
+ ret = sys_ftruncate(fd, length);
+
+ KSYS_RESTORE;
+ return ret;
+}
+
+static inline int kern_sys_read(unsigned int fd, char *buf, size_t count)
+{
+ int ret;
+ KSYS_PREPARE;
+
+ ret = sys_read(fd, (char __user __force *) buf, count);
+
+ KSYS_RESTORE;
+ return ret;
+}
+
+static inline int kern_sys_write(unsigned int fd, const char *buf,
+ size_t count)
+{
+ int ret;
+ KSYS_PREPARE;
+
+ ret = sys_write(fd, (const char __user __force *) buf, count);
+
+ KSYS_RESTORE;
+ return ret;
+}
+
+static inline int kern_sys_close(unsigned int fd)
+{
+ int ret;
+ KSYS_PREPARE;
+
+ ret = sys_close(fd);
+
+ KSYS_RESTORE;
+ return ret;
+}
+
+static inline int kern_sys_symlink(const char *oldname, const char *newname)
+{
+ int ret;
+ KSYS_PREPARE;
+
+ ret = sys_symlink((const char __user __force *) oldname,
+ (const char __user __force *) newname);
+ KSYS_RESTORE;
+ return ret;
+}
+
+static inline int kern_sys_lchown(const char *filename, uid_t user,
+ gid_t group)
+{
+ int ret;
+ KSYS_PREPARE;
+
+ ret = sys_lchown((const char __user __force *) filename, user, group);
+
+ KSYS_RESTORE;
+ return ret;
+}
+
+static inline int kern_sys_getdents64(unsigned int fd,
+ struct linux_dirent64 *dirent,
+ unsigned int count)
+{
+ int ret;
+ KSYS_PREPARE;
+
+ ret = sys_getdents64(fd,
+ (struct linux_dirent64 __user __force *) dirent,
+ count);
+ KSYS_RESTORE;
+ return ret;
+}
+
+
+#undef KSYS_PREPARE
+#undef KSYS_RESTORE
--
1.7.2.2
Replace direct calls to syscall routines with their wrapper functions
defined in init/sys-wrapper.h.
Signed-off-by: Namhyung Kim <[email protected]>
---
init/sys-wrapper.c | 589 ++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 589 insertions(+), 0 deletions(-)
create mode 100644 init/sys-wrapper.c
diff --git a/init/sys-wrapper.c b/init/sys-wrapper.c
new file mode 100644
index 0000000..fa5949f
--- /dev/null
+++ b/init/sys-wrapper.c
@@ -0,0 +1,589 @@
+/*
+ * init/sys-wrapper.c
+ *
+ * Copyright (C) 2010 Namhyung Kim <[email protected]>
+ *
+ * Wrappers for various syscalls for use in the init code.
+ * Most of these functions are copied from their syscall implementation
+ * verbatim except that path lookup codes are changed to use kernel
+ * functions and security checks are removed.
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/fcntl.h>
+#include <linux/dirent.h>
+#include <linux/syscalls.h>
+#include <linux/highuid.h>
+#include "sys-wrapper.h"
+
+int __init kern_sys_link(const char *oldname, const char *newname)
+{
+ struct path old_path;
+ struct dentry *new_dentry;
+ struct nameidata nd;
+ int error;
+
+ error = kern_path(oldname, 0, &old_path);
+ if (error)
+ goto out;
+
+ error = path_lookup(newname, LOOKUP_PARENT, &nd);
+ if (error)
+ goto out_path;
+
+ error = -EXDEV;
+ if (old_path.mnt != nd.path.mnt)
+ goto out_nd;
+
+ new_dentry = lookup_create(&nd, 0);
+ if (IS_ERR(new_dentry)) {
+ error = PTR_ERR(new_dentry);
+ goto out_unlock;
+ }
+
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto out_dput;
+
+ error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry);
+
+ mnt_drop_write(nd.path.mnt);
+out_dput:
+ dput(new_dentry);
+out_unlock:
+ mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+out_nd:
+ path_put(&nd.path);
+out_path:
+ path_put(&old_path);
+out:
+ return error;
+}
+
+static struct dentry *lookup_hash(struct nameidata *nd)
+{
+ int err;
+ struct dentry *base;
+ struct qstr *name;
+ struct inode *inode;
+ struct dentry *dentry;
+
+ base = nd->path.dentry;
+ name = &nd->last;
+ inode = base->d_inode;
+
+ if (inode->i_op->permission) {
+ err = inode->i_op->permission(inode, MAY_EXEC);
+ if (err)
+ return ERR_PTR(err);
+ }
+
+ /*
+ * See if the low-level filesystem might want
+ * to use its own hash..
+ */
+ if (base->d_op && base->d_op->d_hash) {
+ err = base->d_op->d_hash(base, name);
+ if (err < 0)
+ return ERR_PTR(err);
+ }
+
+ /*
+ * Don't bother with __d_lookup: callers are for creat as
+ * well as unlink, so a lot of the time it would cost
+ * a double lookup.
+ */
+ dentry = d_lookup(nd->path.dentry, &nd->last);
+
+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
+ int status = dentry->d_op->d_revalidate(dentry, nd);
+ if (unlikely(status <= 0)) {
+ /*
+ * The dentry failed validation.
+ * If d_revalidate returned 0 attempt to invalidate
+ * the dentry otherwise d_revalidate is asking us
+ * to return a fail status.
+ */
+ if (!status) {
+ if (!d_invalidate(dentry)) {
+ dput(dentry);
+ dentry = NULL;
+ }
+ } else {
+ dput(dentry);
+ return ERR_PTR(status);
+ }
+ }
+ }
+
+ if (!dentry) {
+ struct dentry *old;
+ /* Don't create child dentry for a dead directory. */
+ if (unlikely(IS_DEADDIR(inode)))
+ return ERR_PTR(-ENOENT);
+
+ dentry = d_alloc(base, name);
+ if (unlikely(!dentry))
+ return ERR_PTR(-ENOMEM);
+
+ old = inode->i_op->lookup(inode, dentry, nd);
+ if (unlikely(old)) {
+ dput(dentry);
+ dentry = old;
+ }
+ }
+ return dentry;
+}
+
+int __init kern_sys_unlink(const char *pathname)
+{
+ int error;
+ struct dentry *dentry;
+ struct nameidata nd;
+ struct inode *inode = NULL;
+
+ error = path_lookup(pathname, LOOKUP_PARENT, &nd);
+ if (error)
+ return error;
+
+ error = -EISDIR;
+ if (nd.last_type != LAST_NORM)
+ goto out_path;
+
+ nd.flags &= ~LOOKUP_PARENT;
+
+ mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+ dentry = lookup_hash(&nd);
+ error = PTR_ERR(dentry);
+ if (!IS_ERR(dentry)) {
+ /* Why not before? Because we want correct error value */
+ if (nd.last.name[nd.last.len])
+ goto slashes;
+ inode = dentry->d_inode;
+ if (inode)
+ atomic_inc(&inode->i_count);
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto out_dput;
+ error = vfs_unlink(nd.path.dentry->d_inode, dentry);
+
+ mnt_drop_write(nd.path.mnt);
+ out_dput:
+ dput(dentry);
+ }
+ mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+ if (inode)
+ iput(inode); /* truncate the inode here */
+out_path:
+ path_put(&nd.path);
+ return error;
+
+slashes:
+ error = !dentry->d_inode ? -ENOENT :
+ S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR;
+ goto out_dput;
+}
+
+/* This function is taken from fs/stat.c */
+static int __init cp_new_stat(struct kstat *stat, struct stat *statbuf)
+{
+ struct stat tmp;
+
+#if BITS_PER_LONG == 32
+ if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev))
+ return -EOVERFLOW;
+#else
+ if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev))
+ return -EOVERFLOW;
+#endif
+
+ memset(&tmp, 0, sizeof(tmp));
+#if BITS_PER_LONG == 32
+ tmp.st_dev = old_encode_dev(stat->dev);
+#else
+ tmp.st_dev = new_encode_dev(stat->dev);
+#endif
+ tmp.st_ino = stat->ino;
+ if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
+ return -EOVERFLOW;
+ tmp.st_mode = stat->mode;
+ tmp.st_nlink = stat->nlink;
+ if (tmp.st_nlink != stat->nlink)
+ return -EOVERFLOW;
+ SET_UID(tmp.st_uid, stat->uid);
+ SET_GID(tmp.st_gid, stat->gid);
+#if BITS_PER_LONG == 32
+ tmp.st_rdev = old_encode_dev(stat->rdev);
+#else
+ tmp.st_rdev = new_encode_dev(stat->rdev);
+#endif
+#if BITS_PER_LONG == 32
+ if (stat->size > MAX_NON_LFS)
+ return -EOVERFLOW;
+#endif
+ tmp.st_size = stat->size;
+ tmp.st_atime = stat->atime.tv_sec;
+ tmp.st_mtime = stat->mtime.tv_sec;
+ tmp.st_ctime = stat->ctime.tv_sec;
+#ifdef STAT_HAVE_NSEC
+ tmp.st_atime_nsec = stat->atime.tv_nsec;
+ tmp.st_mtime_nsec = stat->mtime.tv_nsec;
+ tmp.st_ctime_nsec = stat->ctime.tv_nsec;
+#endif
+ tmp.st_blocks = stat->blocks;
+ tmp.st_blksize = stat->blksize;
+
+ memcpy(statbuf, &tmp, sizeof(tmp));
+ return 0;
+}
+
+/*
+ * kern_sys_newlstat - in-kernel counterpart of sys_newlstat()
+ * @filename: kernel-space path; it is looked up with no LOOKUP_* flags,
+ *            so LOOKUP_FOLLOW is not set for the final component
+ * @statbuf:  kernel-space buffer that receives the converted attributes
+ *
+ * Returns 0 on success, or the negative error reported by kern_path(),
+ * vfs_getattr() or cp_new_stat() (the latter returns -EOVERFLOW when a
+ * value does not fit into struct stat).
+ */
+int __init kern_sys_newlstat(const char *filename, struct stat *statbuf)
+{
+	int error;
+	struct path path;
+	struct kstat kstat;
+
+	error = kern_path(filename, 0, &path);
+	if (error)
+		return error;
+
+	error = vfs_getattr(path.mnt, path.dentry, &kstat);
+	if (error)
+		goto out;
+
+	/*
+	 * Propagate the conversion result: cp_new_stat() can fail with
+	 * -EOVERFLOW, in which case statbuf has not been filled in.
+	 */
+	error = cp_new_stat(&kstat, statbuf);
+out:
+	path_put(&path);
+	return error;
+}
+
+int __init kern_sys_mkdir(const char *pathname, int mode)
+{
+ int error;
+ struct dentry *dentry;
+ struct nameidata nd;
+
+ error = path_lookup(pathname, LOOKUP_PARENT, &nd);
+ if (error)
+ goto out_err;
+
+ dentry = lookup_create(&nd, 1);
+ if (IS_ERR(dentry)) {
+ error = PTR_ERR(dentry);
+ goto out_unlock;
+ }
+
+ if (!IS_POSIXACL(nd.path.dentry->d_inode))
+ mode &= ~current_umask();
+
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto out_dput;
+
+ error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
+
+ mnt_drop_write(nd.path.mnt);
+out_dput:
+ dput(dentry);
+out_unlock:
+ mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+ path_put(&nd.path);
+out_err:
+ return error;
+}
+
+int __init kern_sys_rmdir(const char *pathname)
+{
+ int error;
+ struct dentry *dentry;
+ struct nameidata nd;
+
+ error = path_lookup(pathname, LOOKUP_PARENT, &nd);
+ if (error)
+ return error;
+
+ switch(nd.last_type) {
+ case LAST_DOTDOT:
+ error = -ENOTEMPTY;
+ goto exit1;
+ case LAST_DOT:
+ error = -EINVAL;
+ goto exit1;
+ case LAST_ROOT:
+ error = -EBUSY;
+ goto exit1;
+ }
+
+ nd.flags &= ~LOOKUP_PARENT;
+
+ mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+
+ dentry = lookup_hash(&nd);
+ if (IS_ERR(dentry)) {
+ error = PTR_ERR(dentry);
+ goto exit2;
+ }
+
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto exit3;
+
+ error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
+
+ mnt_drop_write(nd.path.mnt);
+exit3:
+ dput(dentry);
+exit2:
+ mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+exit1:
+ path_put(&nd.path);
+ return error;
+}
+
+int __init kern_sys_mknod(const char *filename, int mode, unsigned dev)
+{
+ int error;
+ struct dentry *dentry;
+ struct nameidata nd;
+
+ if (S_ISDIR(mode))
+ return -EPERM;
+
+ error = path_lookup(filename, LOOKUP_PARENT, &nd);
+ if (error)
+ return error;
+
+ dentry = lookup_create(&nd, 0);
+ if (IS_ERR(dentry)) {
+ error = PTR_ERR(dentry);
+ goto out_unlock;
+ }
+
+ if (!IS_POSIXACL(nd.path.dentry->d_inode))
+ mode &= ~current_umask();
+
+ switch (mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFIFO:
+ case S_IFSOCK:
+ case 0: /* zero mode translates to S_IFREG */
+ break;
+ case S_IFDIR:
+ error = -EPERM;
+ goto out_dput;
+ default:
+ error = -EINVAL;
+ goto out_dput;
+ }
+
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto out_dput;
+
+ switch (mode & S_IFMT) {
+ case 0:
+ case S_IFREG:
+ error = vfs_create(nd.path.dentry->d_inode,dentry, mode, &nd);
+ break;
+
+ case S_IFCHR:
+ case S_IFBLK:
+ error = vfs_mknod(nd.path.dentry->d_inode,dentry, mode,
+ new_decode_dev(dev));
+ break;
+
+ case S_IFIFO:
+ case S_IFSOCK:
+ error = vfs_mknod(nd.path.dentry->d_inode,dentry, mode, 0);
+ break;
+ }
+
+ mnt_drop_write(nd.path.mnt);
+out_dput:
+ dput(dentry);
+out_unlock:
+ mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+ path_put(&nd.path);
+
+ return error;
+}
+
+static int __init chown_common(struct path *path, uid_t user, gid_t group)
+{
+ int error;
+ struct iattr newattrs;
+ struct inode *inode = path->dentry->d_inode;
+
+ newattrs.ia_valid = ATTR_CTIME;
+ if (user != (uid_t) -1) {
+ newattrs.ia_valid |= ATTR_UID;
+ newattrs.ia_uid = user;
+ }
+ if (group != (gid_t) -1) {
+ newattrs.ia_valid |= ATTR_GID;
+ newattrs.ia_gid = group;
+ }
+ if (!S_ISDIR(inode->i_mode))
+ newattrs.ia_valid |=
+ ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
+
+ mutex_lock(&inode->i_mutex);
+ error = notify_change(path->dentry, &newattrs);
+ mutex_unlock(&inode->i_mutex);
+
+ return error;
+}
+
+int __init kern_sys_chown(const char *filename, uid_t user, gid_t group)
+{
+ int error;
+ struct path path;
+
+ error = kern_path(filename, LOOKUP_FOLLOW, &path);
+ if (error)
+ goto out;
+
+ error = mnt_want_write(path.mnt);
+ if (error)
+ goto out_release;
+
+ error = chown_common(&path, user, group);
+
+ mnt_drop_write(path.mnt);
+out_release:
+ path_put(&path);
+out:
+ return error;
+}
+
+int __init kern_sys_chmod(const char *filename, mode_t mode)
+{
+ int error;
+ struct path path;
+ struct inode *inode;
+ struct iattr newattrs;
+
+ error = kern_path(filename, LOOKUP_FOLLOW, &path);
+ if (error)
+ goto out;
+
+ inode = path.dentry->d_inode;
+
+ error = mnt_want_write(path.mnt);
+ if (error)
+ goto dput_and_out;
+
+ mutex_lock(&inode->i_mutex);
+
+ if (mode == (mode_t) -1)
+ mode = inode->i_mode;
+
+ newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
+ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
+
+ error = notify_change(path.dentry, &newattrs);
+
+ mutex_unlock(&inode->i_mutex);
+ mnt_drop_write(path.mnt);
+dput_and_out:
+ path_put(&path);
+out:
+ return error;
+}
+
+int __init kern_sys_open(const char *filename, int flags, int mode)
+{
+ int fd;
+
+ if (force_o_largefile())
+ flags |= O_LARGEFILE;
+
+ fd = get_unused_fd_flags(flags);
+ if (fd >= 0) {
+ struct file *f = do_filp_open(AT_FDCWD, filename, flags,
+ mode, 0);
+ if (IS_ERR(f)) {
+ put_unused_fd(fd);
+ fd = PTR_ERR(f);
+ } else {
+ fd_install(fd, f);
+ }
+ }
+ return fd;
+}
+
+int __init kern_sys_symlink(const char *oldname, const char *newname)
+{
+ int error;
+ struct dentry *dentry;
+ struct nameidata nd;
+
+ error = path_lookup(newname, LOOKUP_PARENT, &nd);
+ if (error)
+ goto out_putname;
+
+ dentry = lookup_create(&nd, 0);
+ if (IS_ERR(dentry)) {
+ error = PTR_ERR(dentry);
+ goto out_unlock;
+ }
+
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto out_dput;
+
+ error = vfs_symlink(nd.path.dentry->d_inode, dentry, oldname);
+
+ mnt_drop_write(nd.path.mnt);
+out_dput:
+ dput(dentry);
+out_unlock:
+ mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+ path_put(&nd.path);
+out_putname:
+ return error;
+}
+
+/*
+ * kern_sys_lchown - in-kernel counterpart of sys_lchown()
+ * @filename: kernel-space path of the object whose ownership is changed
+ * @user:     new owner, or (uid_t) -1 to leave the owner untouched
+ * @group:    new group, or (gid_t) -1 to leave the group untouched
+ *
+ * Returns 0 on success, or the negative error reported by kern_path(),
+ * mnt_want_write() or chown_common().
+ */
+int __init kern_sys_lchown(const char *filename, uid_t user, gid_t group)
+{
+	int error;
+	struct path path;
+
+	/*
+	 * lchown() operates on the link itself and on any file type, so
+	 * neither LOOKUP_FOLLOW nor LOOKUP_DIRECTORY may be passed here
+	 * (same flag-less lookup as kern_sys_newlstat()).
+	 */
+	error = kern_path(filename, 0, &path);
+	if (error)
+		goto out;
+
+	/* Take write access to the mount for the duration of the change. */
+	error = mnt_want_write(path.mnt);
+	if (error)
+		goto out_release;
+
+	error = chown_common(&path, user, group);
+
+	mnt_drop_write(path.mnt);
+out_release:
+	path_put(&path);
+out:
+	return error;
+}
--
1.7.2.2
Add a new config option, USE_INIT_SYSCALL_AS_KERNEL_ROUTINE. This makes
some of the kern_sys_*() functions call internal kernel routines directly
instead of calling the syscall routines, so that the user/kernel address
space handling can be dropped.
Signed-off-by: Namhyung Kim <[email protected]>
---
init/Makefile | 2 +
init/sys-wrapper.h | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++
usr/Kconfig | 7 ++++++
3 files changed, 68 insertions(+), 0 deletions(-)
diff --git a/init/Makefile b/init/Makefile
index 0bf677a..296e5ab 100644
--- a/init/Makefile
+++ b/init/Makefile
@@ -15,6 +15,8 @@ mounts-$(CONFIG_BLK_DEV_RAM) += do_mounts_rd.o
mounts-$(CONFIG_BLK_DEV_INITRD) += do_mounts_initrd.o
mounts-$(CONFIG_BLK_DEV_MD) += do_mounts_md.o
+obj-$(CONFIG_USE_INIT_SYSCALL_AS_KERNEL_ROUTINE) += sys-wrapper.o
+
# dependencies on generated files need to be listed explicitly
$(obj)/version.o: include/generated/compile.h
diff --git a/init/sys-wrapper.h b/init/sys-wrapper.h
index e4227f9..38f9ec6 100644
--- a/init/sys-wrapper.h
+++ b/init/sys-wrapper.h
@@ -28,6 +28,8 @@
#include <linux/syscalls.h>
+#ifndef CONFIG_USE_INIT_SYSCALL_AS_KERNEL_ROUTINE
+
/* These macro are called just before/after actual syscalls. */
#define KSYS_PREPARE \
mm_segment_t old_fs = get_fs(); \
@@ -244,3 +246,60 @@ static inline int kern_sys_getdents64(unsigned int fd,
#undef KSYS_PREPARE
#undef KSYS_RESTORE
+
+#else /* !CONFIG_USE_INIT_SYSCALL_AS_KERNEL_ROUTINE */
+
+int kern_sys_link(const char *oldname, const char *newname);
+int kern_sys_unlink(const char *pathname);
+int kern_sys_newlstat(const char *filename, struct stat *statbuf);
+int kern_sys_mkdir(const char *pathname, int mode);
+int kern_sys_rmdir(const char *pathname);
+int kern_sys_mknod(const char *filename, int mode, unsigned dev);
+int kern_sys_chown(const char *filename, uid_t user, gid_t group);
+int kern_sys_chmod(const char *filename, mode_t mode);
+int kern_sys_open(const char *filename, int flags, int mode);
+
+static inline int kern_sys_fchown(unsigned int fd, uid_t user, gid_t group)
+{
+ return sys_fchown(fd, user, group);
+}
+
+static inline int kern_sys_fchmod(unsigned int fd, mode_t mode)
+{
+ return sys_fchmod(fd, mode);
+}
+
+static inline int kern_sys_ftruncate(unsigned int fd, unsigned long length)
+{
+ return sys_ftruncate(fd, length);
+}
+
+static inline int kern_sys_read(unsigned int fd, char *buf, size_t count)
+{
+ return sys_read(fd, (char __user __force *) buf, count);
+}
+
+static inline int kern_sys_write(unsigned int fd, const char *buf,
+ size_t count)
+{
+ return sys_write(fd, (const char __user __force *) buf, count);
+}
+
+static inline int kern_sys_close(unsigned int fd)
+{
+ return sys_close(fd);
+}
+
+int kern_sys_symlink(const char *oldname, const char *newname);
+int kern_sys_lchown(const char *filename, uid_t user, gid_t group);
+
+static inline int kern_sys_getdents64(unsigned int fd,
+ struct linux_dirent64 *dirent,
+ unsigned int count)
+{
+ return sys_getdents64(fd,
+ (struct linux_dirent64 __user __force *) dirent,
+ count);
+}
+
+#endif /* !CONFIG_USE_INIT_SYSCALL_AS_KERNEL_ROUTINE */
diff --git a/usr/Kconfig b/usr/Kconfig
index e2721f5..2a914eb 100644
--- a/usr/Kconfig
+++ b/usr/Kconfig
@@ -148,3 +148,10 @@ config INITRAMFS_COMPRESSION_LZO
(both compression and decompression) is the fastest.
endchoice
+
+config USE_INIT_SYSCALL_AS_KERNEL_ROUTINE
+ bool "Don't call syscalls on init code"
+ depends on BLK_DEV_INITRD
+ default n
+ help
+ Replace syscalls with direct calls to internal kernel functions in init code.
--
1.7.2.2
On Sunday 29 August 2010, Namhyung Kim wrote:
> First two of this patchset wrap all of syscall invocations with kern_sys_*()
> helper functions which does nasty address space conversions for you. This
> idea was suggested by Arnd Bergmann. Last one tries to implement above idea
> - calling internel functions directly - in favour of kernel config option
> even though I'm not sure this is right thing. :-(
I think we can safely say that we do not want the config option, we should
do one option or the other. Since Al already opposed implementing the calls
using low-level VFS operations, that's probably not going to happen.
Arnd
On Sunday 29 August 2010, Namhyung Kim wrote:
> +
> +/* These macro are called just before/after actual syscalls. */
> +#define KSYS_PREPARE \
> + mm_segment_t old_fs = get_fs(); \
> + set_fs(KERNEL_DS);
> +
> +#define KSYS_RESTORE \
> + set_fs(old_fs);
These macros are not that nice, because they depend on context.
I would probably open-code them in each function, or possibly
use a single macro to combine it to something like
#define kern_sys_call(call, ...) \
({ \
mm_segment_t old_fs = get_fs(); \
long result; \
set_fs(KERNEL_DS); \
result = call(__VA_ARGS__); \
set_fs(old_fs); \
result; \
})
static inline int kern_sys_link(const char *oldname, const char *newname)
{
return kern_sys_call(sys_link, (const char __user __force *)oldname,
(const char __user __force *)newname);
}
> +static inline int kern_sys_fchown(unsigned int fd, uid_t user, gid_t group)
> +{
> + int ret;
> + KSYS_PREPARE;
> +
> + ret = sys_fchown(fd, user, group);
> +
> + KSYS_RESTORE;
> + return ret;
> +}
When there are no pointer arguments, there is no need to do set_fs
tricks.
Arnd
On Mon, Aug 30, 2010 at 21:02, Arnd Bergmann <[email protected]> wrote:
> I think we can safely say that we do not want the config option, we should
> do one option or the other. Since Al already opposed implementing the calls
> using low-level VFS operations, that's probably not going to happen.
>
OK. Let's forget about it. :-)
--
Regards,
Namhyung Kim
On Mon, Aug 30, 2010 at 21:11, Arnd Bergmann <[email protected]> wrote:
> On Sunday 29 August 2010, Namhyung Kim wrote:
>> +
>> +/* These macro are called just before/after actual syscalls. */
>> +#define KSYS_PREPARE \
>> + mm_segment_t old_fs = get_fs(); \
>> + set_fs(KERNEL_DS);
>> +
>> +#define KSYS_RESTORE \
>> + set_fs(old_fs);
>
> These macros are not that nice, because they depend on context.
> I would probably open-code them in each function, or possibly
> use a single macro to combine it to something like
>
> #define kern_sys_call(call, ...) \
> ({ \
>        mm_segment_t old_fs = get_fs(); \
>        long result; \
>        set_fs(KERNEL_DS); \
>        result = call(__VA_ARGS__); \
>        set_fs(old_fs); \
>        result; \
> })
>
> static inline int kern_sys_link(const char *oldname, const char *newname)
> {
>        return kern_sys_call(sys_link, (const char __user __force *)oldname,
>                             (const char __user __force *)newname);
> }
>
Cool. Will use it. :-)
>> +static inline int kern_sys_fchown(unsigned int fd, uid_t user, gid_t group)
>> +{
>> + int ret;
>> + KSYS_PREPARE;
>> +
>> + ret = sys_fchown(fd, user, group);
>> +
>> + KSYS_RESTORE;
>> + return ret;
>> +}
>
> When there are no pointer arguments, there is no need to do set_fs
> tricks.
>
My intention was that it might be good, IMHO, to have common setup/tear-down code
around the actual syscall that could possibly be extended in the future. But now I
think it's a kind of over-engineering, so I'll discard it and follow your advice above.
Thanks.
--
Regards,
Namhyung Kim