Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751811AbaFMM2P (ORCPT ); Fri, 13 Jun 2014 08:28:15 -0400 Received: from mail-qc0-f175.google.com ([209.85.216.175]:45025 "EHLO mail-qc0-f175.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751056AbaFMM2N (ORCPT ); Fri, 13 Jun 2014 08:28:13 -0400 MIME-Version: 1.0 Reply-To: mtk.manpages@gmail.com In-Reply-To: <1402655819-14325-4-git-send-email-dh.herrmann@gmail.com> References: <1402655819-14325-1-git-send-email-dh.herrmann@gmail.com> <1402655819-14325-4-git-send-email-dh.herrmann@gmail.com> From: "Michael Kerrisk (man-pages)" Date: Fri, 13 Jun 2014 14:27:52 +0200 Message-ID: Subject: Re: [PATCH v3 3/7] shm: add memfd_create() syscall To: David Herrmann Cc: lkml , Ryan Lortie , Linus Torvalds , Andrew Morton , "linux-mm@kvack.org" , "linux-fsdevel@vger.kernel.org" , Linux API , Greg Kroah-Hartman , John Stultz , Lennart Poettering , Daniel Mack , Kay Sievers , Hugh Dickins , Tony Battersby , Andy Lutomirski Content-Type: text/plain; charset=UTF-8 Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Hi David, On Fri, Jun 13, 2014 at 12:36 PM, David Herrmann wrote: > memfd_create() is similar to mmap(MAP_ANON), but returns a file-descriptor > that you can pass to mmap(). It can support sealing and avoids any > connection to user-visible mount-points. Thus, it's not subject to quotas > on mounted file-systems, but can be used like malloc()'ed memory, but > with a file-descriptor to it. > > memfd_create() returns the raw shmem file, so calls like ftruncate() can > be used to modify the underlying inode. Also calls like fstat() > will return proper information and mark the file as regular file. If you > want sealing, you can specify MFD_ALLOW_SEALING. Otherwise, sealing is not > supported (like on all other regular files). 
> > Compared to O_TMPFILE, it does not require a tmpfs mount-point and is not > subject to quotas and the like. It is still properly accounted to memcg > limits, though. Where do I find / is there detailed documentation (ideally, a man page) for this new system call? Cheers, Michael > > Signed-off-by: David Herrmann > --- > arch/x86/syscalls/syscall_32.tbl | 1 + > arch/x86/syscalls/syscall_64.tbl | 1 + > include/linux/syscalls.h | 1 + > include/uapi/linux/memfd.h | 8 +++++ > kernel/sys_ni.c | 1 + > mm/shmem.c | 72 ++++++++++++++++++++++++++++++++++++++++ > 6 files changed, 84 insertions(+) > create mode 100644 include/uapi/linux/memfd.h > > diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl > index d6b8679..e7495b4 100644 > --- a/arch/x86/syscalls/syscall_32.tbl > +++ b/arch/x86/syscalls/syscall_32.tbl > @@ -360,3 +360,4 @@ > 351 i386 sched_setattr sys_sched_setattr > 352 i386 sched_getattr sys_sched_getattr > 353 i386 renameat2 sys_renameat2 > +354 i386 memfd_create sys_memfd_create > diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl > index ec255a1..28be0e1 100644 > --- a/arch/x86/syscalls/syscall_64.tbl > +++ b/arch/x86/syscalls/syscall_64.tbl > @@ -323,6 +323,7 @@ > 314 common sched_setattr sys_sched_setattr > 315 common sched_getattr sys_sched_getattr > 316 common renameat2 sys_renameat2 > +317 common memfd_create sys_memfd_create > > # > # x32-specific system call numbers start at 512 to avoid cache impact > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h > index b0881a0..0be5d4d 100644 > --- a/include/linux/syscalls.h > +++ b/include/linux/syscalls.h > @@ -802,6 +802,7 @@ asmlinkage long sys_timerfd_settime(int ufd, int flags, > asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr); > asmlinkage long sys_eventfd(unsigned int count); > asmlinkage long sys_eventfd2(unsigned int count, int flags); > +asmlinkage long sys_memfd_create(const char *uname_ptr, unsigned 
int flags); > asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); > asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int); > asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, > diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h > new file mode 100644 > index 0000000..534e364 > --- /dev/null > +++ b/include/uapi/linux/memfd.h > @@ -0,0 +1,8 @@ > +#ifndef _UAPI_LINUX_MEMFD_H > +#define _UAPI_LINUX_MEMFD_H > + > +/* flags for memfd_create(2) (unsigned int) */ > +#define MFD_CLOEXEC 0x0001U > +#define MFD_ALLOW_SEALING 0x0002U > + > +#endif /* _UAPI_LINUX_MEMFD_H */ > diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c > index 36441b5..489a4e6 100644 > --- a/kernel/sys_ni.c > +++ b/kernel/sys_ni.c > @@ -197,6 +197,7 @@ cond_syscall(compat_sys_timerfd_settime); > cond_syscall(compat_sys_timerfd_gettime); > cond_syscall(sys_eventfd); > cond_syscall(sys_eventfd2); > +cond_syscall(sys_memfd_create); > > /* performance counters: */ > cond_syscall(sys_perf_event_open); > diff --git a/mm/shmem.c b/mm/shmem.c > index 1438b3e..e7c5fe1 100644 > --- a/mm/shmem.c > +++ b/mm/shmem.c > @@ -66,7 +66,9 @@ static struct vfsmount *shm_mnt; > #include > #include > #include > +#include > #include > +#include > > #include > #include > @@ -2662,6 +2664,76 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) > shmem_show_mpol(seq, sbinfo->mpol); > return 0; > } > + > +#define MFD_NAME_PREFIX "memfd:" > +#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) > +#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) > + > +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING) > + > +SYSCALL_DEFINE2(memfd_create, > + const char*, uname, > + unsigned int, flags) > +{ > + struct shmem_inode_info *info; > + struct file *file; > + int fd, error; > + char *name; > + long len; > + > + if (flags & ~(unsigned int)MFD_ALL_FLAGS) > + return -EINVAL; > + > + /* length includes 
terminating zero */ > + len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); > + if (len <= 0) > + return -EFAULT; > + if (len > MFD_NAME_MAX_LEN + 1) > + return -EINVAL; > + > + name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_TEMPORARY); > + if (!name) > + return -ENOMEM; > + > + strcpy(name, MFD_NAME_PREFIX); > + if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { > + error = -EFAULT; > + goto err_name; > + } > + > + /* terminating-zero may have changed after strnlen_user() returned */ > + if (name[len + MFD_NAME_PREFIX_LEN - 1]) { > + error = -EFAULT; > + goto err_name; > + } > + > + fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); > + if (fd < 0) { > + error = fd; > + goto err_name; > + } > + > + file = shmem_file_setup(name, 0, VM_NORESERVE); > + if (IS_ERR(file)) { > + error = PTR_ERR(file); > + goto err_fd; > + } > + info = SHMEM_I(file_inode(file)); > + file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; > + if (flags & MFD_ALLOW_SEALING) > + info->seals &= ~F_SEAL_SEAL; > + > + fd_install(fd, file); > + kfree(name); > + return fd; > + > +err_fd: > + put_unused_fd(fd); > +err_name: > + kfree(name); > + return error; > +} > + > #endif /* CONFIG_TMPFS */ > > static void shmem_put_super(struct super_block *sb) > -- > 2.0.0 > -- Michael Kerrisk Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/ Linux/UNIX System Programming Training: http://man7.org/training/ -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/