2023-09-28 20:36:45

by Paolo Bonzini

[permalink] [raw]
Subject: [PATCH gmem FIXUP] kvm: guestmem: do not use a file system

Use a run-of-the-mill anonymous inode, there is nothing useful
being provided by kvm_gmem_fs.

Signed-off-by: Paolo Bonzini <[email protected]>
---
include/uapi/linux/magic.h | 1 -
virt/kvm/guest_mem.c | 110 ++++++++++---------------------------
virt/kvm/kvm_main.c | 6 --
3 files changed, 30 insertions(+), 87 deletions(-)

diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index afe9c376c9a5..6325d1d0e90f 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -101,6 +101,5 @@
#define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */
#define DEVMEM_MAGIC 0x454d444d /* "DMEM" */
#define SECRETMEM_MAGIC 0x5345434d /* "SECM" */
-#define KVM_GUEST_MEMORY_MAGIC 0x474d454d /* "GMEM" */

#endif /* __LINUX_MAGIC_H__ */
diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
index a819367434e9..73b841a2e1b1 100644
--- a/virt/kvm/guest_mem.c
+++ b/virt/kvm/guest_mem.c
@@ -3,14 +3,10 @@
#include <linux/falloc.h>
#include <linux/kvm_host.h>
#include <linux/pagemap.h>
-#include <linux/pseudo_fs.h>
-
-#include <uapi/linux/magic.h>
+#include <linux/anon_inodes.h>

#include "kvm_mm.h"

-static struct vfsmount *kvm_gmem_mnt;
-
struct kvm_gmem {
struct kvm *kvm;
struct xarray bindings;
@@ -356,23 +352,40 @@ static const struct inode_operations kvm_gmem_iops = {
.setattr = kvm_gmem_setattr,
};

-static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags,
- struct vfsmount *mnt)
+static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
const char *anon_name = "[kvm-gmem]";
- const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name));
struct kvm_gmem *gmem;
struct inode *inode;
struct file *file;
int fd, err;

- inode = alloc_anon_inode(mnt->mnt_sb);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
+ fd = get_unused_fd_flags(0);
+ if (fd < 0)
+ return fd;

- err = security_inode_init_security_anon(inode, &qname, NULL);
- if (err)
- goto err_inode;
+ gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
+ if (!gmem) {
+ err = -ENOMEM;
+ goto err_fd;
+ }
+
+ file = anon_inode_getfile(anon_name, &kvm_gmem_fops, gmem,
+ O_RDWR);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto err_gmem;
+ }
+
+ file->f_flags |= O_LARGEFILE;
+
+ kvm_get_kvm(kvm);
+ gmem->kvm = kvm;
+ xa_init(&gmem->bindings);
+ list_add(&gmem->entry, &file->f_mapping->private_list);
+
+ inode = file->f_inode;
+ WARN_ON(file->f_mapping != inode->i_mapping);

inode->i_private = (void *)(unsigned long)flags;
inode->i_op = &kvm_gmem_iops;
@@ -385,44 +398,13 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags,
/* Unmovable mappings are supposed to be marked unevictable as well. */
WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

- fd = get_unused_fd_flags(0);
- if (fd < 0) {
- err = fd;
- goto err_inode;
- }
-
- file = alloc_file_pseudo(inode, mnt, "kvm-gmem", O_RDWR, &kvm_gmem_fops);
- if (IS_ERR(file)) {
- err = PTR_ERR(file);
- goto err_fd;
- }
-
- file->f_flags |= O_LARGEFILE;
- file->f_mapping = inode->i_mapping;
-
- gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
- if (!gmem) {
- err = -ENOMEM;
- goto err_file;
- }
-
- kvm_get_kvm(kvm);
- gmem->kvm = kvm;
- xa_init(&gmem->bindings);
-
- file->private_data = gmem;
-
- list_add(&gmem->entry, &inode->i_mapping->private_list);
-
fd_install(fd, file);
return fd;

-err_file:
- fput(file);
+err_gmem:
+ kfree(gmem);
err_fd:
put_unused_fd(fd);
-err_inode:
- iput(inode);
return err;
}

@@ -455,7 +437,7 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
if (!kvm_gmem_is_valid_size(size, flags))
return -EINVAL;

- return __kvm_gmem_create(kvm, size, flags, kvm_gmem_mnt);
+ return __kvm_gmem_create(kvm, size, flags);
}

int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
@@ -603,35 +585,3 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
return r;
}
EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
-
-static int kvm_gmem_init_fs_context(struct fs_context *fc)
-{
- if (!init_pseudo(fc, KVM_GUEST_MEMORY_MAGIC))
- return -ENOMEM;
-
- return 0;
-}
-
-static struct file_system_type kvm_gmem_fs = {
- .name = "kvm_guest_memory",
- .init_fs_context = kvm_gmem_init_fs_context,
- .kill_sb = kill_anon_super,
-};
-
-int kvm_gmem_init(void)
-{
- kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);
- if (IS_ERR(kvm_gmem_mnt))
- return PTR_ERR(kvm_gmem_mnt);
-
- /* For giggles. Userspace can never map this anyways. */
- kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
-
- return 0;
-}
-
-void kvm_gmem_exit(void)
-{
- kern_unmount(kvm_gmem_mnt);
- kvm_gmem_mnt = NULL;
-}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a83dfef1316e..4a1ded1faf84 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -6424,10 +6424,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
if (r)
goto err_async_pf;

- r = kvm_gmem_init();
- if (r)
- goto err_gmem;
-
kvm_chardev_ops.owner = module;

kvm_preempt_ops.sched_in = kvm_sched_in;
@@ -6454,8 +6450,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
err_register:
kvm_vfio_ops_exit();
err_vfio:
- kvm_gmem_exit();
-err_gmem:
kvm_async_pf_deinit();
err_async_pf:
kvm_irqfd_exit();
--
2.39.1


2023-09-29 06:29:52

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH gmem FIXUP] kvm: guestmem: do not use a file system

On Thu, 28 Sep 2023 14:06:51 -0400, Paolo Bonzini wrote:
> Use a run-of-the-mill anonymous inode, there is nothing useful
> being provided by kvm_gmem_fs.
>
>

Applied to kvm-x86 guest_memfd, thanks!

[1/1] kvm: guestmem: do not use a file system
https://github.com/kvm-x86/linux/commit/0f7e60a5f42a

--
https://github.com/kvm-x86/linux/tree/next

2023-10-09 02:16:57

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH gmem FIXUP] kvm: guestmem: do not use a file system

On Thu, Sep 28, 2023 at 02:06:51PM -0400, Paolo Bonzini wrote:
> Use a run-of-the-mill anonymous inode, there is nothing useful
> being provided by kvm_gmem_fs.

> - inode = alloc_anon_inode(mnt->mnt_sb);
> - if (IS_ERR(inode))
> - return PTR_ERR(inode);
> + fd = get_unused_fd_flags(0);
> + if (fd < 0)
> + return fd;
>
> - err = security_inode_init_security_anon(inode, &qname, NULL);
> - if (err)
> - goto err_inode;
> + gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
> + if (!gmem) {
> + err = -ENOMEM;
> + goto err_fd;
> + }
> +
> + file = anon_inode_getfile(anon_name, &kvm_gmem_fops, gmem,
> + O_RDWR);

> + inode = file->f_inode;
> + WARN_ON(file->f_mapping != inode->i_mapping);
>
> inode->i_private = (void *)(unsigned long)flags;
> inode->i_op = &kvm_gmem_iops;

That's very badly broken. The whole point of anon_inode_getfile() is
that *ALL* resulting files share the same inode. You are not allowed
to modify the damn thing.

2023-10-09 02:23:03

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH gmem FIXUP] kvm: guestmem: do not use a file system

On Thu, Sep 28, 2023 at 07:22:16PM -0700, Sean Christopherson wrote:
> On Thu, 28 Sep 2023 14:06:51 -0400, Paolo Bonzini wrote:
> > Use a run-of-the-mill anonymous inode, there is nothing useful
> > being provided by kvm_gmem_fs.
> >
> >
>
> Applied to kvm-x86 guest_memfd, thanks!
>
> [1/1] kvm: guestmem: do not use a file system
> https://github.com/kvm-x86/linux/commit/0f7e60a5f42a

Please, revert; this is completely broken. anon_inode_getfile()
yields a file with the same ->f_inode every time it is called.

Again, ->f_inode of those things is shared to hell and back,
very much by design. You can't modify its ->i_op or anything
other field, for that matter. No information can be stored
in that thing - you are only allowed to use the object you've
passed via 'priv' argument.

NAKed-by: Al Viro <[email protected]>

2023-10-09 02:39:02

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH gmem FIXUP] kvm: guestmem: do not use a file system

On Mon, Oct 09, 2023 at 03:22:48AM +0100, Al Viro wrote:
> On Thu, Sep 28, 2023 at 07:22:16PM -0700, Sean Christopherson wrote:
> > On Thu, 28 Sep 2023 14:06:51 -0400, Paolo Bonzini wrote:
> > > Use a run-of-the-mill anonymous inode, there is nothing useful
> > > being provided by kvm_gmem_fs.
> > >
> > >
> >
> > Applied to kvm-x86 guest_memfd, thanks!
> >
> > [1/1] kvm: guestmem: do not use a file system
> > https://github.com/kvm-x86/linux/commit/0f7e60a5f42a
>
> Please, revert; this is completely broken. anon_inode_getfile()
> yields a file with the same ->f_inode every time it is called.
>
> Again, ->f_inode of those things is shared to hell and back,
> very much by design. You can't modify its ->i_op or anything
> other field, for that matter. No information can be stored
> in that thing - you are only allowed to use the object you've
> passed via 'priv' argument.

BTW, note that all those suckers will end up sharing the page
cache; looks like that's not what you want to get...

> NAKed-by: Al Viro <[email protected]>

2023-10-09 14:33:10

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH gmem FIXUP] kvm: guestmem: do not use a file system

On Mon, Oct 09, 2023, Al Viro wrote:
> On Thu, Sep 28, 2023 at 07:22:16PM -0700, Sean Christopherson wrote:
> > On Thu, 28 Sep 2023 14:06:51 -0400, Paolo Bonzini wrote:
> > > Use a run-of-the-mill anonymous inode, there is nothing useful
> > > being provided by kvm_gmem_fs.
> > >
> > >
> >
> > Applied to kvm-x86 guest_memfd, thanks!
> >
> > [1/1] kvm: guestmem: do not use a file system
> > https://github.com/kvm-x86/linux/commit/0f7e60a5f42a
>
> Please, revert; this is completely broken. anon_inode_getfile()
> yields a file with the same ->f_inode every time it is called.
>
> Again, ->f_inode of those things is shared to hell and back,
> very much by design. You can't modify its ->i_op or anything
> other field, for that matter. No information can be stored
> in that thing - you are only allowed to use the object you've
> passed via 'priv' argument.

Yeah, we found that out the hard way. Is using the "secure" variant to get a
per-file inode a sane approach, or is that abuse that's going to bite us too?

/*
* Use the so called "secure" variant, which creates a unique inode
* instead of reusing a single inode. Each guest_memfd instance needs
* its own inode to track the size, flags, etc.
*/
file = anon_inode_getfile_secure(anon_name, &kvm_gmem_fops, gmem,
O_RDWR, NULL);

2023-10-09 20:07:00

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH gmem FIXUP] kvm: guestmem: do not use a file system

On Mon, Oct 09, 2023 at 07:32:48AM -0700, Sean Christopherson wrote:

> Yeah, we found that out the hard way. Is using the "secure" variant to get a
> per-file inode a sane approach, or is that abuse that's going to bite us too?
>
> /*
> * Use the so called "secure" variant, which creates a unique inode
> * instead of reusing a single inode. Each guest_memfd instance needs
> * its own inode to track the size, flags, etc.
> */
> file = anon_inode_getfile_secure(anon_name, &kvm_gmem_fops, gmem,
> O_RDWR, NULL);

Umm... Is there any chance that your call site will ever be in a module?
If not, you are probably OK with that variant. I don't like the details
of that interface (anon_inode_getfile_secure(), that is), but that's
a separate story and your use wouldn't make things harder to clean up.

2023-10-09 20:20:23

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH gmem FIXUP] kvm: guestmem: do not use a file system

On Mon, Oct 09, 2023, Al Viro wrote:
> On Mon, Oct 09, 2023 at 07:32:48AM -0700, Sean Christopherson wrote:
>
> > Yeah, we found that out the hard way. Is using the "secure" variant to get a
> > per-file inode a sane approach, or is that abuse that's going to bite us too?
> >
> > /*
> > * Use the so called "secure" variant, which creates a unique inode
> > * instead of reusing a single inode. Each guest_memfd instance needs
> > * its own inode to track the size, flags, etc.
> > */
> > file = anon_inode_getfile_secure(anon_name, &kvm_gmem_fops, gmem,
> > O_RDWR, NULL);
>
> Umm... Is there any chance that your call site will ever be in a module?
> If not, you are probably OK with that variant.

Yes, this code can be compiled as a module. I assume there are issues with the inode
outliving the module?

2023-10-09 20:42:03

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH gmem FIXUP] kvm: guestmem: do not use a file system

On Mon, Oct 09, 2023 at 01:20:06PM -0700, Sean Christopherson wrote:
> On Mon, Oct 09, 2023, Al Viro wrote:
> > On Mon, Oct 09, 2023 at 07:32:48AM -0700, Sean Christopherson wrote:
> >
> > > Yeah, we found that out the hard way. Is using the "secure" variant to get a
> > > per-file inode a sane approach, or is that abuse that's going to bite us too?
> > >
> > > /*
> > > * Use the so called "secure" variant, which creates a unique inode
> > > * instead of reusing a single inode. Each guest_memfd instance needs
> > > * its own inode to track the size, flags, etc.
> > > */
> > > file = anon_inode_getfile_secure(anon_name, &kvm_gmem_fops, gmem,
> > > O_RDWR, NULL);
> >
> > Umm... Is there any chance that your call site will ever be in a module?
> > If not, you are probably OK with that variant.
>
> Yes, this code can be compiled as a module. I assume there issues with the inode
> outliving the module?

The entire file, actually... If you are using that mechanism in a module, you
need to initialize kvm_gmem_fops.owner to THIS_MODULE; AFAICS, you don't have
that done.

2023-10-09 21:27:03

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH gmem FIXUP] kvm: guestmem: do not use a file system

On Mon, Oct 09, 2023, Al Viro wrote:
> On Mon, Oct 09, 2023 at 01:20:06PM -0700, Sean Christopherson wrote:
> > On Mon, Oct 09, 2023, Al Viro wrote:
> > > On Mon, Oct 09, 2023 at 07:32:48AM -0700, Sean Christopherson wrote:
> > >
> > > > Yeah, we found that out the hard way. Is using the "secure" variant to get a
> > > > per-file inode a sane approach, or is that abuse that's going to bite us too?
> > > >
> > > > /*
> > > > * Use the so called "secure" variant, which creates a unique inode
> > > > * instead of reusing a single inode. Each guest_memfd instance needs
> > > > * its own inode to track the size, flags, etc.
> > > > */
> > > > file = anon_inode_getfile_secure(anon_name, &kvm_gmem_fops, gmem,
> > > > O_RDWR, NULL);
> > >
> > > Umm... Is there any chance that your call site will ever be in a module?
> > > If not, you are probably OK with that variant.
> >
> > Yes, this code can be compiled as a module. I assume there issues with the inode
> > outliving the module?
>
> The entire file, actually... If you are using that mechanism in a module, you
> need to initialize kvm_gmem_fops.owner to THIS_MODULE; AFAICS, you don't have
> that done.

Ah, that's handled indirectly by a chain of refcounted objects. Every
VM that KVM creates gets a reference to the module, and each guest_memfd instance
gets a reference to its owning VM.

Thanks much for the help!

2023-10-10 00:09:33

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH gmem FIXUP] kvm: guestmem: do not use a file system

On Mon, Oct 09, 2023 at 02:26:36PM -0700, Sean Christopherson wrote:
> On Mon, Oct 09, 2023, Al Viro wrote:
> > On Mon, Oct 09, 2023 at 01:20:06PM -0700, Sean Christopherson wrote:
> > > On Mon, Oct 09, 2023, Al Viro wrote:
> > > > On Mon, Oct 09, 2023 at 07:32:48AM -0700, Sean Christopherson wrote:
> > > >
> > > > > Yeah, we found that out the hard way. Is using the "secure" variant to get a
> > > > > per-file inode a sane approach, or is that abuse that's going to bite us too?
> > > > >
> > > > > /*
> > > > > * Use the so called "secure" variant, which creates a unique inode
> > > > > * instead of reusing a single inode. Each guest_memfd instance needs
> > > > > * its own inode to track the size, flags, etc.
> > > > > */
> > > > > file = anon_inode_getfile_secure(anon_name, &kvm_gmem_fops, gmem,
> > > > > O_RDWR, NULL);
> > > >
> > > > Umm... Is there any chance that your call site will ever be in a module?
> > > > If not, you are probably OK with that variant.
> > >
> > > Yes, this code can be compiled as a module. I assume there issues with the inode
> > > outliving the module?
> >
> > The entire file, actually... If you are using that mechanism in a module, you
> > need to initialize kvm_gmem_fops.owner to THIS_MODULE; AFAICS, you don't have
> > that done.
>
> Ah, that's handled indirectly handled by a chain of refcounted objects. Every
> VM that KVM creates gets a reference to the module, and each guest_memfd instance
> gets a reference to its owning VM.

Umm... what's the usual call chain leading to final drop of refcount of that
module?

2023-10-10 00:27:13

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH gmem FIXUP] kvm: guestmem: do not use a file system

On Tue, Oct 10, 2023, Al Viro wrote:
> On Mon, Oct 09, 2023 at 02:26:36PM -0700, Sean Christopherson wrote:
> > On Mon, Oct 09, 2023, Al Viro wrote:
> > > On Mon, Oct 09, 2023 at 01:20:06PM -0700, Sean Christopherson wrote:
> > > > On Mon, Oct 09, 2023, Al Viro wrote:
> > > > > On Mon, Oct 09, 2023 at 07:32:48AM -0700, Sean Christopherson wrote:
> > > > >
> > > > > > Yeah, we found that out the hard way. Is using the "secure" variant to get a
> > > > > > per-file inode a sane approach, or is that abuse that's going to bite us too?
> > > > > >
> > > > > > /*
> > > > > > * Use the so called "secure" variant, which creates a unique inode
> > > > > > * instead of reusing a single inode. Each guest_memfd instance needs
> > > > > > * its own inode to track the size, flags, etc.
> > > > > > */
> > > > > > file = anon_inode_getfile_secure(anon_name, &kvm_gmem_fops, gmem,
> > > > > > O_RDWR, NULL);
> > > > >
> > > > > Umm... Is there any chance that your call site will ever be in a module?
> > > > > If not, you are probably OK with that variant.
> > > >
> > > > Yes, this code can be compiled as a module. I assume there issues with the inode
> > > > outliving the module?
> > >
> > > The entire file, actually... If you are using that mechanism in a module, you
> > > need to initialize kvm_gmem_fops.owner to THIS_MODULE; AFAICS, you don't have
> > > that done.
> >
> > Ah, that's handled indirectly handled by a chain of refcounted objects. Every
> > VM that KVM creates gets a reference to the module, and each guest_memfd instance
> > gets a reference to its owning VM.
>
> Umm... what's the usual call chain leading to final drop of refcount of that
> module?

If the last reference is effectively held by guest_memfd, it would be:

kvm_gmem_release(), a.k.a. file_operations.release()
|
-> kvm_put_kvm()
|
-> kvm_destroy_vm()
|
-> module_put(kvm_chardev_ops.owner);

2023-10-10 00:38:09

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH gmem FIXUP] kvm: guestmem: do not use a file system

On Mon, Oct 09, 2023 at 05:27:04PM -0700, Sean Christopherson wrote:

> If the last reference is effectively held by guest_memfd, it would be:
>
> kvm_gmem_release(), a.k.a. file_operations.release()
> |
> -> kvm_put_kvm()
> |
> -> kvm_destroy_vm()
> |
> -> module_put(kvm_chardev_ops.owner);

... and now your thread gets preempted and loses CPU; before you get
it back, some joker calls delete_module(), and page of code containing
kvm_gmem_release() is unmapped. Even though an address within that
page is stored as return address in a frame on your thread's stack.
That thread gets the timeslice again and proceeds to return into
unmapped page. Oops...

2023-10-10 23:31:39

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH gmem FIXUP] kvm: guestmem: do not use a file system

On Tue, Oct 10, 2023, Al Viro wrote:
> On Mon, Oct 09, 2023 at 05:27:04PM -0700, Sean Christopherson wrote:
>
> > If the last reference is effectively held by guest_memfd, it would be:
> >
> > kvm_gmem_release(), a.k.a. file_operations.release()
> > |
> > -> kvm_put_kvm()
> > |
> > -> kvm_destroy_vm()
> > |
> > -> module_put(kvm_chardev_ops.owner);
>
> ... and now your thread gets preempted and loses CPU; before you get
> it back, some joker calls delete_module(), and page of code containing
> kvm_gmem_release() is unmapped. Even though an address within that
> page is stored as return address in a frame on your thread's stack.
> That thread gets the timeslice again and proceeds to return into
> unmapped page. Oops...

*sigh*

What an absolute snafu. Sorry for the wall of text. Feel free to stop reading
after the "Back to KVM..." part below, it's just gory details on how we screwed
things up in KVM.

But one question before I dive into the KVM mess: other than error paths, how is
module_put(THIS_MODULE) ever safe without being superfluous? I don't see how a
module can put the last reference to itself without either hitting the above
scenario, or without creating deadlock. Something other than the module must put
the last reference, no?

The only exceptions I see are:

1. if module_put() is called via async_run_entry_fn(), as delete_module() invokes
async_synchronize_full() before unmapping the module. But IIUC, the async
framework uses workqueues, not the other way around. I.e. delete_module()
doesn't flush arbitrary workqueues.

2. if module_put() is called via module_put_and_kthread_exit(), which uses
THIS_MODULE but does module_put() from a core kernel helper and never returns
to the module's kthread, i.e. doesn't return to module code.

But then this

$ git grep -E "module_put\(THIS_MODULE" | wc -l
132

make me go "huh"? I've blamed a handful of those calls, and I can't find a single
one that provides any clue as to why the module gets/puts references to itself,
let alone actually justifies the usage.

E.g. drivers/block/loop.c has this gem

/* This is safe: open() is still holding a reference. */
module_put(THIS_MODULE);

in __loop_clr_fd(), which is invoked from a .release() function. So open() quite
clearly doesn't hold a reference, unless the comment is talking about the reference
that was obtained by the core file systems layer and won't be put until after
.release() completes. But then what on earth is the point of doing
module_get(THIS_MODULE) and module_put(THIS_MODULE)?


Back to KVM...

Commit 5f6de5cbebee ("KVM: Prevent module exit until all VMs are freed") *tried*
to fix a bug where KVM-the-module could be unloaded while a KVM workqueue callback
was still in-flight. The callback had a reference to the VM, but not to the VM's
file representation.

After that commit went in, I suggested dropping the use of .owner for VMs and
vCPUs (each of which is represented by an anon inode file) because keeping the VM
alive would pin KVM-the-module until all VMs went away. But I missed the
obvious-in-hindsight issue Al highlighted above.

Fixing that particular wart is relatively easy: revert commit 70375c2d8fa3 ("Revert
"KVM: set owner of cpu and vm file operations""), and give all of the other KVM-owned
file_operations structures the same treatment by setting .owner correctly. Note,
"correctly" isn't THIS_MODULE in most cases, which is why the existing code is a
bit odd. For most file_operations, on x86 and PPC (and MIPS?), the effective owner
is actually a sub-module, e.g. THIS_MODULE will point at kvm.ko, but on x86 the
effective owner is either kvm-intel.ko or kvm-amd.ko (which holds a reference to
kvm.ko).

After staring and fiddling for most of today, I finally discovered that grabbing
a reference to the module on behalf of the work item didn't fix the actual bugs,
plural, it just shuffled the deck chairs on the Titanic. And as above, it set us
up to make even bigger mistakes regarding .owner :-(

The problematic code is effectively kvm_clear_async_pf_completion_queue(). That
helper is called for each vCPU when a VM is being destroyed, i.e. when the last
reference to a VM is put via kvm_put_kvm(). Clearing the queue *should* also
flush all work items, except it doesn't when the work is "done", where "done" just
means the page being faulted in is ready. Using file_operations.owner doesn't
solve anything, e.g. even if async_pf_execute() were gifted a reference to the
VM's file and used the deferred fput(), the same preemption issue exists, it's
just slightly harder to hit.

The original async #PF code appears to have fudged around the lack of flushing by
gifting a VM reference to the async_pf_execute(). Or maybe it was the other way
around and not flushing was a workaround for the deadlock that occurs if
kvm_clear_async_pf_completion_queue() does flush the workqueue. If kvm_put_kvm()
is called from async_pf_execute() and kvm_put_kvm() flushes the async #PF workqueue,
deadlock occurs because async_pf_execute() can't return until kvm_put_kvm() finishes,
and kvm_put_kvm() can't return until async_pf_execute() finishes.

WARNING: CPU: 8 PID: 251 at virt/kvm/kvm_main.c:1435 kvm_put_kvm+0x2d/0x320 [kvm]
Modules linked in: vhost_net vhost vhost_iotlb tap kvm_intel kvm irqbypass
CPU: 8 PID: 251 Comm: kworker/8:1 Tainted: G W 6.6.0-rc1-e7af8d17224a-x86/gmem-vm #119
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
Workqueue: events async_pf_execute [kvm]
RIP: 0010:kvm_put_kvm+0x2d/0x320 [kvm]
Call Trace:
<TASK>
async_pf_execute+0x198/0x260 [kvm]
process_one_work+0x145/0x2d0
worker_thread+0x27e/0x3a0
kthread+0xba/0xe0
ret_from_fork+0x2d/0x50
ret_from_fork_asm+0x11/0x20
</TASK>
---[ end trace 0000000000000000 ]---
INFO: task kworker/8:1:251 blocked for more than 120 seconds.
Tainted: G W 6.6.0-rc1-e7af8d17224a-x86/gmem-vm #119
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:kworker/8:1 state:D stack:0 pid:251 ppid:2 flags:0x00004000
Workqueue: events async_pf_execute [kvm]
Call Trace:
<TASK>
__schedule+0x33f/0xa40
schedule+0x53/0xc0
schedule_timeout+0x12a/0x140
__wait_for_common+0x8d/0x1d0
__flush_work.isra.0+0x19f/0x2c0
kvm_clear_async_pf_completion_queue+0x129/0x190 [kvm]
kvm_arch_destroy_vm+0x78/0x1b0 [kvm]
kvm_put_kvm+0x1c1/0x320 [kvm]
async_pf_execute+0x198/0x260 [kvm]
process_one_work+0x145/0x2d0
worker_thread+0x27e/0x3a0
kthread+0xba/0xe0
ret_from_fork+0x2d/0x50
ret_from_fork_asm+0x11/0x20
</TASK>

If kvm_clear_async_pf_completion_queue() actually flushes the workqueue, then
there's no need to gift async_pf_execute() a reference because all invocations
of async_pf_execute() will be forced to complete before the vCPU and its VM are
destroyed/freed. And that also fixes the module unloading mess because __fput()
won't do module_put() on the last vCPU reference until the vCPU has been freed.

The attached patches are lightly tested, but I think they fix the KVM mess. I
likely won't post a proper series until next week, I'm going to be offline the
next two days.


Attachments:
(No filename) (7.24 kB)
0001-KVM-Set-file_operations.owner-appropriately-for-all-.patch (3.00 kB)
0002-KVM-Always-flush-async-PF-workqueue-when-vCPU-is-bei.patch (2.02 kB)
0003-Revert-KVM-Prevent-module-exit-until-all-VMs-are-fre.patch (2.06 kB)
Download all attachments

2023-10-11 15:09:51

by Xu Yilun

[permalink] [raw]
Subject: Re: [PATCH gmem FIXUP] kvm: guestmem: do not use a file system

On Tue, Oct 10, 2023 at 04:30:18PM -0700, Sean Christopherson wrote:
> On Tue, Oct 10, 2023, Al Viro wrote:
> > On Mon, Oct 09, 2023 at 05:27:04PM -0700, Sean Christopherson wrote:
> >
> > > If the last reference is effectively held by guest_memfd, it would be:
> > >
> > > kvm_gmem_release(), a.k.a. file_operations.release()
> > > |
> > > -> kvm_put_kvm()
> > > |
> > > -> kvm_destroy_vm()
> > > |
> > > -> module_put(kvm_chardev_ops.owner);
> >
> > ... and now your thread gets preempted and loses CPU; before you get
> > it back, some joker calls delete_module(), and page of code containing
> > kvm_gmem_release() is unmapped. Even though an address within that
> > page is stored as return address in a frame on your thread's stack.
> > That thread gets the timeslice again and proceeds to return into
> > unmapped page. Oops...
>
> *sigh*
>
> What an absolute snafu. Sorry for the wall of text. Feel free to stop reading
> after the "Back to KVM..." part below, it's just gory details on how we screwed
> things up in KVM.
>
> But one question before I dive into the KVM mess: other than error paths, how is
> module_put(THIS_MODULE) ever safe without being superfluous? I don't see how a
> module can put the last reference to itself without either hitting the above
> scenario, or without creating deadlock. Something other than the module must put

I think module_get/put(THIS_MODULE) is not responsible for guarding
this scenario. It just marks a section as protected against module removal.
Outside of this section, you still need other mechanisms to ensure no task
runs on the module code before its module_exit() returns. This is still true
even if no module_get/put(THIS_MODULE) is used.

Today, with all your help, I can see that fops.owner = THIS_MODULE is an
effective way to ensure there is no entry into the file callbacks once
module_exit() is called.

> the last reference, no?
>
> The only exceptions I see are:
>
> 1. if module_put() is called via async_run_entry_fn(), as delete_module() invokes
> async_synchronize_full() before unmapping the module. But IIUC, the async
> framework uses workqueues, not the other way around. I.e. delete_module()
> doesn't flush arbitrary workqueues.
>
> 2. if module_put() is called via module_put_and_kthread_exit(), which uses
> THIS_MODULE but does module_put() from a core kernel helper and never returns
> to the module's kthread, i.e. doesn't return to module code.
>
> But then this
>
> $ git grep -E "module_put\(THIS_MODULE" | wc -l
> 132
>
> make me go "huh"? I've blamed a handful of those calls, and I can't find a single
> one that provides any clue as to why the module gets/puts references to itself,
> let alone actually justifies the usage.
>
> E.g. drivers/block/loop.c has this gem
>
> /* This is safe: open() is still holding a reference. */
> module_put(THIS_MODULE);
>
> in __loop_clr_fd(), which is invoked from a .release() function. So open() quite
> clearly doesn't hold a reference, unless the comment is talking about the reference
> that was obtained by the core file systems layer and won't be put until after
> .release() completes. But then what on earth is the point of doing
> module_get(THIS_MODULE) and module_put(THIS_MODULE)?

I see the comment that .release()->__loop_clr_fd() is only called when in
"autoclear mode". Maybe in some "manual mode", user should explicitly
call IOCTL(LOOP_CLR_FD) to allow module removal. That means in some
case, user called IOCTL(LOOP_CONFIGURE) and then closed all fds, but the
device state was not cleaned up, so that the driver module is not allowed to
be removed.

Thanks,
Yilun

>
>
> Back to KVM...
>
> Commit 5f6de5cbebee ("KVM: Prevent module exit until all VMs are freed") *tried*
> to fix a bug where KVM-the-module could be unloaded while a KVM workqueue callback
> was still in-flight. The callback had a reference to the VM, but not to the VM's
> file representation.
>
> After that commit went in, I suggested dropping the use of .owner for VMs and
> vCPUs (each of which is represented by an anon inode file) because keeping the VM
> alive would pin KVM-the-module until all VMs went away. But I missed the
> obvious-in-hindsight issue Al highlighted above.
>
> Fixing that particular wart is relatively easy: revert commit 70375c2d8fa3 ("Revert
> "KVM: set owner of cpu and vm file operations""), and give all of the other KVM-owned
> file_operations structures the same treatment by setting .owner correctly. Note,
> "correctly" isn't THIS_MODULE in most cases, which is why the code existing is a
> bit odd. For most file_operations, on x86 and PPC (and MIPS?), the effective owner
> is actually a sub-module, e.g. THIS_MODULE will point at kvm.ko, but on x86 the
> effective owner is either kvm-intel.ko or kvm-amd.ko (which holds a reference to
> kvm.ko).
>
> After staring and fiddling for most of today, I finally discovered that grabbing
> a reference to the module on behalf of the work item didn't fix the actual bugs,
> plural, it just shuffled the deck chairs on the Titanic. And as above, it set us
> up to make even bigger mistakes regarding .owner :-(
>
> The problematic code is effectively kvm_clear_async_pf_completion_queue(). That
> helper is called for each vCPU when a VM is being destroyed, i.e. when the last
> reference to a VM is put via kvm_put_kvm(). Clearing the queue *should* also
> flush all work items, except it doesn't when the work is "done", where "done" just
> means the page being faulted in is ready. Using file_operations.owner doesn't
> solve anything, e.g. even if async_pf_execute() were gifted a reference to the
> VM's file and used the deferred fput(), the same preemption issue exists, it's
> just slightly harder to hit.
>
> The original async #PF code appears to have fudged around the lack of flushing by
> gifting a VM reference to the async_pf_execute(). Or maybe it was the other way
> around and not flushing was a workaround for the deadlock that occurs if
> kvm_clear_async_pf_completion_queue() does flush the workqueue. If kvm_put_kvm()
> is called from async_pf_execute() and kvm_put_kvm() flushes the async #PF workqueue,
> deadlock occurs because async_pf_execute() can't return until kvm_put_kvm() finishes,
> and kvm_put_kvm() can't return until async_pf_execute() finishes.
>
> WARNING: CPU: 8 PID: 251 at virt/kvm/kvm_main.c:1435 kvm_put_kvm+0x2d/0x320 [kvm]
> Modules linked in: vhost_net vhost vhost_iotlb tap kvm_intel kvm irqbypass
> CPU: 8 PID: 251 Comm: kworker/8:1 Tainted: G W 6.6.0-rc1-e7af8d17224a-x86/gmem-vm #119
> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
> Workqueue: events async_pf_execute [kvm]
> RIP: 0010:kvm_put_kvm+0x2d/0x320 [kvm]
> Call Trace:
> <TASK>
> async_pf_execute+0x198/0x260 [kvm]
> process_one_work+0x145/0x2d0
> worker_thread+0x27e/0x3a0
> kthread+0xba/0xe0
> ret_from_fork+0x2d/0x50
> ret_from_fork_asm+0x11/0x20
> </TASK>
> ---[ end trace 0000000000000000 ]---
> INFO: task kworker/8:1:251 blocked for more than 120 seconds.
> Tainted: G W 6.6.0-rc1-e7af8d17224a-x86/gmem-vm #119
> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> task:kworker/8:1 state:D stack:0 pid:251 ppid:2 flags:0x00004000
> Workqueue: events async_pf_execute [kvm]
> Call Trace:
> <TASK>
> __schedule+0x33f/0xa40
> schedule+0x53/0xc0
> schedule_timeout+0x12a/0x140
> __wait_for_common+0x8d/0x1d0
> __flush_work.isra.0+0x19f/0x2c0
> kvm_clear_async_pf_completion_queue+0x129/0x190 [kvm]
> kvm_arch_destroy_vm+0x78/0x1b0 [kvm]
> kvm_put_kvm+0x1c1/0x320 [kvm]
> async_pf_execute+0x198/0x260 [kvm]
> process_one_work+0x145/0x2d0
> worker_thread+0x27e/0x3a0
> kthread+0xba/0xe0
> ret_from_fork+0x2d/0x50
> ret_from_fork_asm+0x11/0x20
> </TASK>
>
> If kvm_clear_async_pf_completion_queue() actually flushes the workqueue, then
> there's no need to gift async_pf_execute() a reference because all invocations
> of async_pf_execute() will be forced to complete before the vCPU and its VM are
> destroyed/freed. And that also fixes the module unloading mess because __fput()
> won't do module_put() on the last vCPU reference until the vCPU has been freed.
>
> The attached patches are lightly tested, but I think they fix the KVM mess. I
> likely won't post a proper series until next week, I'm going to be offline the
> next two days.

> From 017fedee5608094f2e5535297443db7512a213b8 Mon Sep 17 00:00:00 2001
> From: Sean Christopherson <[email protected]>
> Date: Tue, 10 Oct 2023 11:42:32 -0700
> Subject: [PATCH 1/3] KVM: Set file_operations.owner appropriately for all such
> structures
>
> This reverts commit 70375c2d8fa3fb9b0b59207a9c5df1e2e1205c10, and gives
> all of the other KVM-owned file_operations structures the same treatment
> by setting .owner appropriately.
>
> Signed-off-by: Sean Christopherson <[email protected]>
> ---
> arch/x86/kvm/debugfs.c | 1 +
> virt/kvm/kvm_main.c | 11 ++++++++---
> 2 files changed, 9 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
> index ee8c4c3496ed..eea6ea7f14af 100644
> --- a/arch/x86/kvm/debugfs.c
> +++ b/arch/x86/kvm/debugfs.c
> @@ -182,6 +182,7 @@ static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file)
> }
>
> static const struct file_operations mmu_rmaps_stat_fops = {
> + .owner = THIS_MODULE,
> .open = kvm_mmu_rmaps_stat_open,
> .read = seq_read,
> .llseek = seq_lseek,
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 486800a7024b..1e65a506985f 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -3887,7 +3887,7 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp)
> return 0;
> }
>
> -static const struct file_operations kvm_vcpu_fops = {
> +static struct file_operations kvm_vcpu_fops = {
> .release = kvm_vcpu_release,
> .unlocked_ioctl = kvm_vcpu_ioctl,
> .mmap = kvm_vcpu_mmap,
> @@ -4081,6 +4081,7 @@ static int kvm_vcpu_stats_release(struct inode *inode, struct file *file)
> }
>
> static const struct file_operations kvm_vcpu_stats_fops = {
> + .owner = THIS_MODULE,
> .read = kvm_vcpu_stats_read,
> .release = kvm_vcpu_stats_release,
> .llseek = noop_llseek,
> @@ -4431,7 +4432,7 @@ static int kvm_device_release(struct inode *inode, struct file *filp)
> return 0;
> }
>
> -static const struct file_operations kvm_device_fops = {
> +static struct file_operations kvm_device_fops = {
> .unlocked_ioctl = kvm_device_ioctl,
> .release = kvm_device_release,
> KVM_COMPAT(kvm_device_ioctl),
> @@ -4759,6 +4760,7 @@ static int kvm_vm_stats_release(struct inode *inode, struct file *file)
> }
>
> static const struct file_operations kvm_vm_stats_fops = {
> + .owner = THIS_MODULE,
> .read = kvm_vm_stats_read,
> .release = kvm_vm_stats_release,
> .llseek = noop_llseek,
> @@ -5060,7 +5062,7 @@ static long kvm_vm_compat_ioctl(struct file *filp,
> }
> #endif
>
> -static const struct file_operations kvm_vm_fops = {
> +static struct file_operations kvm_vm_fops = {
> .release = kvm_vm_release,
> .unlocked_ioctl = kvm_vm_ioctl,
> .llseek = noop_llseek,
> @@ -6095,6 +6097,9 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
> goto err_async_pf;
>
> kvm_chardev_ops.owner = module;
> + kvm_vm_fops.owner = module;
> + kvm_vcpu_fops.owner = module;
> + kvm_device_fops.owner = module;
>
> kvm_preempt_ops.sched_in = kvm_sched_in;
> kvm_preempt_ops.sched_out = kvm_sched_out;
>
> base-commit: dfdc8b7884b50e3bfa635292973b530a97689f12
> --
> 2.42.0.609.gbb76f46606-goog
>

> From f5be42f3be9967a0591051a7c8d73cac2c0a072b Mon Sep 17 00:00:00 2001
> From: Sean Christopherson <[email protected]>
> Date: Tue, 10 Oct 2023 13:42:13 -0700
> Subject: [PATCH 2/3] KVM: Always flush async #PF workqueue when vCPU is being
> destroyed
>
> Signed-off-by: Sean Christopherson <[email protected]>
> ---
> virt/kvm/async_pf.c | 15 ++++++++++++---
> 1 file changed, 12 insertions(+), 3 deletions(-)
>
> diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
> index e033c79d528e..7aeb9d1f43b1 100644
> --- a/virt/kvm/async_pf.c
> +++ b/virt/kvm/async_pf.c
> @@ -87,7 +87,6 @@ static void async_pf_execute(struct work_struct *work)
> __kvm_vcpu_wake_up(vcpu);
>
> mmput(mm);
> - kvm_put_kvm(vcpu->kvm);
> }
>
> void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
> @@ -114,7 +113,6 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
> #else
> if (cancel_work_sync(&work->work)) {
> mmput(work->mm);
> - kvm_put_kvm(vcpu->kvm); /* == work->vcpu->kvm */
> kmem_cache_free(async_pf_cache, work);
> }
> #endif
> @@ -126,7 +124,19 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
> list_first_entry(&vcpu->async_pf.done,
> typeof(*work), link);
> list_del(&work->link);
> +
> + spin_unlock(&vcpu->async_pf.lock);
> +
> + /*
> + * The async #PF is "done", but KVM must wait for the work item
> + * itself, i.e. async_pf_execute(), to run to completion. If
> + * KVM is a module, KVM must ensure *no* code owned by the KVM
> + * (the module) can be run after the last call to module_put(),
> + * i.e. after the last reference to the last vCPU's file is put.
> + */
> + flush_work(&work->work);
> kmem_cache_free(async_pf_cache, work);
> + spin_lock(&vcpu->async_pf.lock);
> }
> spin_unlock(&vcpu->async_pf.lock);
>
> @@ -186,7 +196,6 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> work->arch = *arch;
> work->mm = current->mm;
> mmget(work->mm);
> - kvm_get_kvm(work->vcpu->kvm);
>
> INIT_WORK(&work->work, async_pf_execute);
>
> --
> 2.42.0.609.gbb76f46606-goog
>

> From 0a4238f027e41c64afa2919440420ea56c0cae80 Mon Sep 17 00:00:00 2001
> From: Sean Christopherson <[email protected]>
> Date: Tue, 10 Oct 2023 15:09:43 -0700
> Subject: [PATCH 3/3] Revert "KVM: Prevent module exit until all VMs are freed"
>
> Revert KVM's misguided attempt to "fix" a use-after-module-unload bug that
> was actually due to failure to flush a workqueue, not a lack of module
> refcounting.
>
> blah blah blah
>
> This reverts commit 405294f29faee5de8c10cb9d4a90e229c2835279 and commit
> 5f6de5cbebee925a612856fce6f9182bb3eee0db.
>
> Fixes: 405294f29fae ("KVM: Unconditionally get a ref to /dev/kvm module when creating a VM")
> Fixes: 5f6de5cbebee ("KVM: Prevent module exit until all VMs are freed")
> Signed-off-by: Sean Christopherson <[email protected]>
> ---
> virt/kvm/kvm_main.c | 7 -------
> 1 file changed, 7 deletions(-)
>
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 1e65a506985f..3b1b9e8dd70c 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -115,8 +115,6 @@ EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
>
> static const struct file_operations stat_fops_per_vm;
>
> -static struct file_operations kvm_chardev_ops;
> -
> static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
> unsigned long arg);
> #ifdef CONFIG_KVM_COMPAT
> @@ -1157,9 +1155,6 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
> if (!kvm)
> return ERR_PTR(-ENOMEM);
>
> - /* KVM is pinned via open("/dev/kvm"), the fd passed to this ioctl(). */
> - __module_get(kvm_chardev_ops.owner);
> -
> KVM_MMU_LOCK_INIT(kvm);
> mmgrab(current->mm);
> kvm->mm = current->mm;
> @@ -1279,7 +1274,6 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
> out_err_no_srcu:
> kvm_arch_free_vm(kvm);
> mmdrop(current->mm);
> - module_put(kvm_chardev_ops.owner);
> return ERR_PTR(r);
> }
>
> @@ -1348,7 +1342,6 @@ static void kvm_destroy_vm(struct kvm *kvm)
> preempt_notifier_dec();
> hardware_disable_all();
> mmdrop(mm);
> - module_put(kvm_chardev_ops.owner);
> }
>
> void kvm_get_kvm(struct kvm *kvm)
> --
> 2.42.0.609.gbb76f46606-goog
>

2023-10-11 17:58:54

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH gmem FIXUP] kvm: guestmem: do not use a file system

On 10/11/23 01:30, Sean Christopherson wrote:
>> E.g. drivers/block/loop.c has this gem
>>
>> /* This is safe: open() is still holding a reference. */
>> module_put(THIS_MODULE);
>
> in __loop_clr_fd(), which is invoked from a .release() function. So open() quite
> clearly doesn't hold a reference, unless the comment is talking about the reference
> that was obtained by the core file systems layer and won't be put until after
> .release() completes. But then what on earth is the point of doing
> module_get(THIS_MODULE) and module_put(THIS_MODULE)?

Here the module_put() is called not just from .release() in autoclear
mode, but also from the LOOP_CLR_FD ioctl.

So the idea here is that while a /dev/loopN device exists you must keep
the module alive, to ensure that the devices don't disappear from /dev.
So the point here is to keep the module alive after /dev/loop-control
has been closed; but if /dev/loopN is open it will keep the module alive
on its own, and this makes module_get/module_put safe in this particular
case.

In general, safety is guaranteed if module_put is called while the
module's reference count is still elevated by someone else, which could
be a file descriptor or some core subsystem. But then you're right that
in many cases there seems to be no point in doing module_get/module_put.
In drivers/watchdog/softdog.c, softdog_stop() is called while the
module's reference count is still elevated by the core watchdog code,
which makes the code safe. But why does softdog.c need to have its own
reference? Any attempt to unregister the softdog module will go through
hrtimer_cancel(&softdog_ticktock) - which waits for the timer callback
to be complete, just like flush_work() in your patch.

This module_get/module_put _is_ unnecessary. It was introduced as a
bandaid in commit 5889f06bd31d ("watchdog: refuse to unload softdog
module when its timer is running", 2015-12-27). Back then the core code
wasn't careful to keep the module refcount elevated if the watchdog was
still running in watchdog_release. When commit ee142889e32f ("watchdog:
Introduce WDOG_HW_RUNNING flag", 2016-03-16) fixed the race for real,
softdog.c wasn't adjusted.

I agree that in many cases, however, the safety does not seem to be
there. I cannot find a place that ensures that snd-aoa-soundbus-i2sbus
is kept alive while i2sbus_private_free runs, for example (though that
code is a maze).

Your patch 2 looks good, but perhaps instead of setting the owner we
could stash the struct module* in a global, and try_get/put it from open
and release respectively? That is, .owner keeps the kvm module alive
and the kvm module keeps kvm-intel/kvm-amd alive. That would subsume
patches 1 and 3.

Paolo

2023-10-13 16:55:55

by Sean Christopherson

[permalink] [raw]
Subject: Re: [PATCH gmem FIXUP] kvm: guestmem: do not use a file system

On Wed, Oct 11, 2023, Paolo Bonzini wrote:
> Your patch 2 looks good, but perhaps instead of setting the owner we could
> stash the struct module* in a global, and try_get/put it from open and
> release respectively? That is, .owner keeps the kvm module alive and the
> kvm module keeps kvm-intel/kvm-amd alive. That would subsume patches 1 and 3.

I don't think that would be a net positive. We'd have to implement .open() for
several file types just to get a reference to the sub-module. At that point, the
odds of forgetting to implement .open() are about the same as forgetting to set
.owner when adding a new file type, e.g. guest_memfd.

2023-10-16 15:21:18

by Paolo Bonzini

[permalink] [raw]
Subject: Re: [PATCH gmem FIXUP] kvm: guestmem: do not use a file system

On 10/13/23 18:55, Sean Christopherson wrote:
> On Wed, Oct 11, 2023, Paolo Bonzini wrote:
>> Your patch 2 looks good, but perhaps instead of setting the owner we could
>> stash the struct module* in a global, and try_get/put it from open and
>> release respectively? That is, .owner keeps the kvm module alive and the
>> kvm module keeps kvm-intel/kvm-amd alive. That would subsume patches 1 and 3.
>
> I don't think that would be a net positive. We'd have to implement .open() for
> several file types just to get a reference to the sub-module. At that point, the
> odds of forgetting to implement .open() are about the same as forgetting to set
> .owner when adding a new file type, e.g. guest_memfd.

Fair enough, it's true that many fops don't have a .open callback already.

Paolo