Received-SPF: pass (google.com: domain of linux-kernel-owner@vger.kernel.org designates 2620:137:e000::1:20 as permitted sender) client-ip=2620:137:e000::1:20;
MIME-Version: 1.0
References: <20220601210951.3916598-1-axelrasmussen@google.com>
 <20220601210951.3916598-3-axelrasmussen@google.com> <Yqjc7t+XPtfWUVlw@xz-m1.local>
In-Reply-To: <Yqjc7t+XPtfWUVlw@xz-m1.local>
From:   Axel Rasmussen <axelrasmussen@google.com>
Date:   Tue, 14 Jun 2022 17:53:49 -0700
Message-ID: <CAJHvVcg+r+mmdODWKH3YpA7VyRs+H-UfaB3W2JOsGj0V=2=yHg@mail.gmail.com>
Subject: Re: [PATCH v3 2/6] userfaultfd: add /dev/userfaultfd for fine grained
 access control
To:     Peter Xu <peterx@redhat.com>
Cc:     Alexander Viro <viro@zeniv.linux.org.uk>,
        Andrew Morton <akpm@linux-foundation.org>,
        Charan Teja Reddy <charante@codeaurora.org>,
        Dave Hansen <dave.hansen@linux.intel.com>,
        "Dmitry V . Levin" <ldv@altlinux.org>,
        Gleb Fotengauer-Malinovskiy <glebfm@altlinux.org>,
        Hugh Dickins <hughd@google.com>, Jan Kara <jack@suse.cz>,
        Jonathan Corbet <corbet@lwn.net>,
        Mel Gorman <mgorman@techsingularity.net>,
        Mike Kravetz <mike.kravetz@oracle.com>,
        Mike Rapoport <rppt@kernel.org>, Nadav Amit <namit@vmware.com>,
        Shuah Khan <shuah@kernel.org>,
        Suren Baghdasaryan <surenb@google.com>,
        Vlastimil Babka <vbabka@suse.cz>,
        zhangyi <yi.zhang@huawei.com>, linux-doc@vger.kernel.org,
        linux-fsdevel@vger.kernel.org, LKML <linux-kernel@vger.kernel.org>,
        Linux MM <linux-mm@kvack.org>,
        Linuxkselftest <linux-kselftest@vger.kernel.org>
Content-Type: text/plain; charset="UTF-8"
Precedence: bulk

On Tue, Jun 14, 2022 at 12:09 PM Peter Xu <peterx@redhat.com> wrote:
>
> On Wed, Jun 01, 2022 at 02:09:47PM -0700, Axel Rasmussen wrote:
> > Historically, it has been shown that intercepting kernel faults with
> > userfaultfd (thereby forcing the kernel to wait for an arbitrary amount
> > of time) can be exploited, or at least can make some kinds of exploits
> > easier. So, in 37cd0575b8 "userfaultfd: add UFFD_USER_MODE_ONLY" we
> > changed things so, in order for kernel faults to be handled by
> > userfaultfd, either the process needs CAP_SYS_PTRACE, or this sysctl
> > must be configured so that any unprivileged user can do it.
> >
> > In a typical implementation of a hypervisor with live migration (take
> > QEMU/KVM as one such example), we do indeed need to be able to handle
> > kernel faults. But, both options above are less than ideal:
> >
> > - Toggling the sysctl increases attack surface by allowing any
> >   unprivileged user to do it.
> >
> > - Granting the live migration process CAP_SYS_PTRACE gives it this
> >   ability, but *also* the ability to "observe and control the
> >   execution of another process [...], and examine and change [its]
> >   memory and registers" (from ptrace(2)). This isn't something we need
> >   or want to be able to do, so granting this permission violates the
> >   "principle of least privilege".
> >
> > This is all a long winded way to say: we want a more fine-grained way to
> > grant access to userfaultfd, without granting other additional
> > permissions at the same time.
> >
> > To achieve this, add a /dev/userfaultfd misc device. This device
> > provides an alternative to the userfaultfd(2) syscall for the creation
> > of new userfaultfds. The idea is, any userfaultfds created this way will
> > be able to handle kernel faults, without the caller having any special
> > capabilities. Access to this mechanism is instead restricted using e.g.
> > standard filesystem permissions.
> >
> > Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
> > ---
> >  fs/userfaultfd.c                 | 76 ++++++++++++++++++++++++++------
> >  include/uapi/linux/userfaultfd.h |  4 ++
> >  2 files changed, 66 insertions(+), 14 deletions(-)
> >
> > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > index e943370107d0..8b92c1398169 100644
> > --- a/fs/userfaultfd.c
> > +++ b/fs/userfaultfd.c
> > @@ -30,6 +30,7 @@
> >  #include <linux/security.h>
> >  #include <linux/hugetlb.h>
> >  #include <linux/swapops.h>
> > +#include <linux/miscdevice.h>
> >
> >  int sysctl_unprivileged_userfaultfd __read_mostly;
> >
> > @@ -413,13 +414,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
> >
> >       if (ctx->features & UFFD_FEATURE_SIGBUS)
> >               goto out;
> > -     if ((vmf->flags & FAULT_FLAG_USER) == 0 &&
> > -         ctx->flags & UFFD_USER_MODE_ONLY) {
> > -             printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
> > -                     "sysctl knob to 1 if kernel faults must be handled "
> > -                     "without obtaining CAP_SYS_PTRACE capability\n");
> > +     if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
> >               goto out;
> > -     }
> >
> >       /*
> >        * If it's already released don't get it. This avoids to loop
> > @@ -2052,19 +2048,33 @@ static void init_once_userfaultfd_ctx(void *mem)
> >       seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
> >  }
> >
> > -SYSCALL_DEFINE1(userfaultfd, int, flags)
> > +static inline bool userfaultfd_allowed(bool is_syscall, int flags)
> > +{
> > +     bool kernel_faults = !(flags & UFFD_USER_MODE_ONLY);
> > +     bool allow_unprivileged = sysctl_unprivileged_userfaultfd;
> > +
> > +     /* userfaultfd(2) access is controlled by sysctl + capability. */
> > +     if (is_syscall && kernel_faults) {
> > +             if (!allow_unprivileged && !capable(CAP_SYS_PTRACE))
> > +                     return false;
> > +     }
> > +
> > +     /*
> > +      * For /dev/userfaultfd, access is to be controlled using e.g.
> > +      * permissions on the device node. We assume this is correctly
> > +      * configured by userspace, so we simply allow access here.
> > +      */
> > +
> > +     return true;
> > +}
>
> This helper reads a bit weird because potentially it constantly returns
> "true" for !syscall use case but it's very not obvious..
>
> Would it be cleaner to not pass in the bool at all?  Something like (I also
> un-nested some of the condition checks, hopefully it'll be easier to read):
>
> bool userfaultfd_syscall_allowed(int flags)
> {
>         /* Userspace-only page faults are always allowed */
>         if (flags & UFFD_USER_MODE_ONLY)
>                 return true;
>
>         /*
>          * The user is requesting kernel fault capabilities. Privileged
>          * users are always allowed even for kernel fault traps.
>          */
>         if (capable(CAP_SYS_PTRACE))
>                 return true;
>
>         /* Whether we allow unprivileged users for kernel faults? */
>         return sysctl_unprivileged_userfaultfd;
> }
>
> Then below...
>
> > +
> > +static int new_userfaultfd(bool is_syscall, int flags)
> >  {
> >       struct userfaultfd_ctx *ctx;
> >       int fd;
> >
> > -     if (!sysctl_unprivileged_userfaultfd &&
> > -         (flags & UFFD_USER_MODE_ONLY) == 0 &&
> > -         !capable(CAP_SYS_PTRACE)) {
> > -             printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
> > -                     "sysctl knob to 1 if kernel faults must be handled "
> > -                     "without obtaining CAP_SYS_PTRACE capability\n");
> > +     if (!userfaultfd_allowed(is_syscall, flags))
> >               return -EPERM;
> > -     }
>
> .. we could write it as:
>
>         if (is_syscall && !userfaultfd_syscall_allowed(flags))
>                 return -EPERM;
>
> What do you think?
>
> >
> >       BUG_ON(!current->mm);
> >
> > @@ -2083,6 +2093,10 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
> >       refcount_set(&ctx->refcount, 1);
> >       ctx->flags = flags;
> >       ctx->features = 0;
> > +     /*
> > +      * If UFFD_USER_MODE_ONLY is not set, then userfaultfd_allowed() above
> > +      * decided that kernel faults were allowed and should be handled.
> > +      */
>
> Hmm.. why this needs to be added above "released=false"? Did you want to
> add this (perhaps) above "flags" instead?
>
> IMHO when people reading the flags it'll be clear already on how it was
> handled, the thing is the comment probably hide deep anyway so I'd consider
> omitting it.
>
> The rest looks good to me, thanks.

Thanks for reviewing, Peter! Most of these comments look good to me,
I'll include them in a v4 after I get back to the office in about a
week.

>
> >       ctx->released = false;
> >       atomic_set(&ctx->mmap_changing, 0);
> >       ctx->mm = current->mm;
> > @@ -2098,8 +2112,42 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
> >       return fd;
> >  }
> >
> > +SYSCALL_DEFINE1(userfaultfd, int, flags)
> > +{
> > +     return new_userfaultfd(true, flags);
> > +}
> > +
> > +static int userfaultfd_dev_open(struct inode *inode, struct file *file)
> > +{
> > +     return 0;
> > +}
> > +
> > +static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
> > +{
> > +     if (cmd != USERFAULTFD_IOC_NEW)
> > +             return -EINVAL;
> > +
> > +     return new_userfaultfd(false, flags);
> > +}
> > +
> > +static const struct file_operations userfaultfd_dev_fops = {
> > +     .open = userfaultfd_dev_open,
> > +     .unlocked_ioctl = userfaultfd_dev_ioctl,
> > +     .compat_ioctl = userfaultfd_dev_ioctl,
> > +     .owner = THIS_MODULE,
> > +     .llseek = noop_llseek,
> > +};
> > +
> > +static struct miscdevice userfaultfd_misc = {
> > +     .minor = MISC_DYNAMIC_MINOR,
> > +     .name = "userfaultfd",
> > +     .fops = &userfaultfd_dev_fops
> > +};
> > +
> >  static int __init userfaultfd_init(void)
> >  {
> > +     WARN_ON(misc_register(&userfaultfd_misc));
> > +
> >       userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
> >                                               sizeof(struct userfaultfd_ctx),
> >                                               0,
> > diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
> > index 7d32b1e797fb..005e5e306266 100644
> > --- a/include/uapi/linux/userfaultfd.h
> > +++ b/include/uapi/linux/userfaultfd.h
> > @@ -12,6 +12,10 @@
> >
> >  #include <linux/types.h>
> >
> > +/* ioctls for /dev/userfaultfd */
> > +#define USERFAULTFD_IOC 0xAA
> > +#define USERFAULTFD_IOC_NEW _IO(USERFAULTFD_IOC, 0x00)
> > +
> >  /*
> >   * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and
> >   * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR.  In
> > --
> > 2.36.1.255.ge46751e96f-goog
> >
>
> --
> Peter Xu
>