Events that poll() responds to are supposed to be consumed when the file
is read(), not by the poll() itself. By putting it on the poll() itself,
it makes it impossible to poll() on a epoll file descriptor, since the
event gets consumed too early. Jann wrote a PoC, available in the link
below.
Reported-by: Jann Horn <[email protected]>
Cc: Kees Cook <[email protected]>
Cc: Luis Chamberlain <[email protected]>
Cc: [email protected]
Link: https://lore.kernel.org/lkml/CAG48ez1F0P7Wnp=PGhiUej=u=8CSF6gpD9J=Oxxg0buFRqV1tA@mail.gmail.com/
Signed-off-by: Jason A. Donenfeld <[email protected]>
---
fs/proc/proc_sysctl.c | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 7d9cfc730bd4..1aa145794207 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -622,6 +622,14 @@ static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter,
static ssize_t proc_sys_read(struct kiocb *iocb, struct iov_iter *iter)
{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct ctl_table_header *head = grab_header(inode);
+ struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+
+ if (!IS_ERR(head) && table->poll)
+ iocb->ki_filp->private_data = proc_sys_poll_event(table->poll);
+ sysctl_head_finish(head);
+
return proc_sys_call_handler(iocb, iter, 0);
}
@@ -668,10 +676,8 @@ static __poll_t proc_sys_poll(struct file *filp, poll_table *wait)
event = (unsigned long)filp->private_data;
poll_wait(filp, &table->poll->wait, wait);
- if (event != atomic_read(&table->poll->event)) {
- filp->private_data = proc_sys_poll_event(table->poll);
+ if (event != atomic_read(&table->poll->event))
ret = EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
- }
out:
sysctl_head_finish(head);
--
2.35.1
In order to inform userspace of virtual machine forks, this commit adds
a "fork_event" sysctl, which does not return any data, but allows
userspace processes to poll() on it for notification of VM forks.
It avoids exposing the actual vmgenid from the hypervisor to userspace,
in case there is any randomness value in keeping it secret. Rather,
userspace is expected to simply use getrandom() if it wants a fresh
value.
For example, the following snippet can be used to print a message every
time a VM forks, after the RNG has been reseeded:
struct pollfd fd = { .fd = open("/proc/sys/kernel/random/fork_event", O_RDONLY) };
assert(fd.fd >= 0);
for (;;) {
read(fd.fd, NULL, 0);
assert(poll(&fd, 1, -1) > 0);
puts("vm fork detected");
}
Various programs and libraries that utilize cryptographic operations
depending on fresh randomness can invalidate old keys or take other
appropriate actions when receiving that event. While this is racier than
allowing userspace to mmap/vDSO the vmgenid itself, it's an incremental
step forward that's not as heavyweight.
Cc: Dominik Brodowski <[email protected]>
Cc: Greg Kroah-Hartman <[email protected]>
Cc: Theodore Ts'o <[email protected]>
Cc: Alexander Graf <[email protected]>
Cc: Colm MacCarthaigh <[email protected]>
Cc: Torben Hansen <[email protected]>
Cc: Jann Horn <[email protected]>
Signed-off-by: Jason A. Donenfeld <[email protected]>
---
Documentation/admin-guide/sysctl/kernel.rst | 6 ++++--
drivers/char/random.c | 24 +++++++++++++++++++++
2 files changed, 28 insertions(+), 2 deletions(-)
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 1144ea3229a3..ddbd603f0be7 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -1001,7 +1001,7 @@ This is a directory, with the following entries:
* ``urandom_min_reseed_secs``: obsolete (used to determine the minimum
number of seconds between urandom pool reseeding). This file is
writable for compatibility purposes, but writing to it has no effect
- on any RNG behavior.
+ on any RNG behavior;
* ``uuid``: a UUID generated every time this is retrieved (this can
thus be used to generate UUIDs at will);
@@ -1009,8 +1009,10 @@ This is a directory, with the following entries:
* ``write_wakeup_threshold``: when the entropy count drops below this
(as a number of bits), processes waiting to write to ``/dev/random``
are woken up. This file is writable for compatibility purposes, but
- writing to it has no effect on any RNG behavior.
+ writing to it has no effect on any RNG behavior;
+* ``fork_event``: unreadable, but can be poll()'d on for notifications
+ delivered after the RNG reseeds following a virtual machine fork.
randomize_va_space
==================
diff --git a/drivers/char/random.c b/drivers/char/random.c
index bffc8682d6b8..39eda91b07ec 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1213,6 +1213,9 @@ EXPORT_SYMBOL_GPL(add_bootloader_randomness);
#if IS_ENABLED(CONFIG_VMGENID)
static BLOCKING_NOTIFIER_HEAD(vmfork_chain);
+#ifdef CONFIG_SYSCTL
+static DEFINE_CTL_TABLE_POLL(sysctl_fork_event_poll);
+#endif
/*
* Handle a new unique VM ID, which is unique, not secret, so we
@@ -1227,6 +1230,9 @@ void add_vmfork_randomness(const void *unique_vm_id, size_t size)
pr_notice("crng reseeded due to virtual machine fork\n");
}
blocking_notifier_call_chain(&vmfork_chain, 0, NULL);
+#ifdef CONFIG_SYSCTL
+ proc_sys_poll_notify(&sysctl_fork_event_poll);
+#endif
}
#if IS_MODULE(CONFIG_VMGENID)
EXPORT_SYMBOL_GPL(add_vmfork_randomness);
@@ -1694,6 +1700,8 @@ const struct file_operations urandom_fops = {
* It is writable to avoid breaking old userspaces, but writing
* to it does not change any behavior of the RNG.
*
+ * - fork_event - an unreadable file that can be poll()'d on for VM forks.
+ *
********************************************************************/
#ifdef CONFIG_SYSCTL
@@ -1747,6 +1755,14 @@ static int proc_do_rointvec(struct ctl_table *table, int write, void *buffer,
return write ? 0 : proc_dointvec(table, 0, buffer, lenp, ppos);
}
+#if IS_ENABLED(CONFIG_VMGENID)
+static int proc_do_nodata(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return -ENODATA;
+}
+#endif
+
static struct ctl_table random_table[] = {
{
.procname = "poolsize",
@@ -1787,6 +1803,14 @@ static struct ctl_table random_table[] = {
.mode = 0444,
.proc_handler = proc_do_uuid,
},
+#if IS_ENABLED(CONFIG_VMGENID)
+ {
+ .procname = "fork_event",
+ .mode = 0444,
+ .poll = &sysctl_fork_event_poll,
+ .proc_handler = proc_do_nodata,
+ },
+#endif
{ }
};
--
2.35.1
Hey Lennart,
On Tue, May 03, 2022 at 09:42:40AM +0200, Lennart Poettering wrote:
> For this MAC address usecase it's entirely sufficient to be able to
> distinguish if the system was closed at all, i.e. if the counter is
> zero or is non-zero. Because that would already be great for a policy
> of "hash it in a stable way from /etc/machine-id, if counter == 0" +
> "use random MAC once counter > 0".
Hm, are you sure that's actually what you want? It turns out this
vmgenid notification from the hypervisor might not be sufficiently
granular for this use case:
- vmgenid changes when you fork a new snapshot, so now you have two VMs
- vmgenid also changes when you rewind to 2 minutes ago
The first is what I assume you care about for this networkd business.
The second is probably not what any networkd user expects.
[Aside: I hope there are few networkd users; having seen what Yu did
with wireguard and how fast and recklessly that went, I can't recommend
that part of systemd to anyone.]
From the perspective of randomness, both of these events imply the same
thing. The situation is BAD; reseed immediately. From the perspective of
MAC addresses, though, these events would imply different behavior,
right? So it seems like vmgenid might need an additional field for this
use case. Relatedly, VMware has that prompt where you select about your
VM whether, "I moved it" or "I copied it." Presumably something like
that would play a part in what is decided as part of this hypothetical
second field.
Let me know if this seems right to you, or if actually you had in mind
changing MAC addresses in both cases instead.
Jason
On Tue, May 03, 2022 at 11:32:26AM +0200, Lennart Poettering wrote:
> So first of all, it appears to me that rewinding a VM is something people
> would do for debugging things, i.e. not how things are done on
> deployment.
I wouldn't be so sure here... Some people have processes around, "always
start out from the same place", like for build machines, and employ a
single VM snapshot that's always rewound after use. It's never forked
into multiple snapshots, but just always goes back to that same starting
point.
> > >From the perspective of randomness, both of these events imply the same
> > thing. The situation is BAD; reseed immediately. From the perspective of
> > MAC addresses, though, these events would imply different behavior,
> > right? So it seems like vmgenid might need an additional field for this
> > use case. Relatedly, VMware has that prompt where you select about your
> > VM whether, "I moved it" or "I copied it." Presumably something like
> > that would play a part in what is decided as part of this hypothetical
> > second field.
>
> networkd doesn't change MAC addresses in the middle of everything, but
> only when a network interface is downed and upped again. This for
> example happens when a link beat goes away and comes back. In the
> rewind-2min case i'd assume the link beat would probably be restored
> to what it was 2min ago (and thus stay online), but in the clone case
> it would probably drop momentarily and be restored than, to tell
> software to reacquire dhcp and so on.
That sounds like it's going to be sort of confusing. Let's say we've got
some VM scenario in which rewinds are common due to whatever weird
process (such as a build machine that wants to start out at the same
place each time). During its course of execution, it reboots, or maybe
there's some network connectivity issue and the link goes down. In that
case, when the link comes up, it's going to have a different MAC
address? That doesn't make much sense to me, but maybe I'm missing some
bigger picture detail.
Jason
On Di, 03.05.22 12:07, Jason A. Donenfeld ([email protected]) wrote:
> I wouldn't be so sure here... Some people have processes around, "always
> start out from the same place", like for build machines, and employ a
> single VM snapshot that's always rewound after use. It's never forked
> into multiple snapshots, but just always goes back to that same starting
> point.
At sometimes it's futile phantasizing which exotic usecases people
might have.
> > > >From the perspective of randomness, both of these events imply the same
> > > thing. The situation is BAD; reseed immediately. From the perspective of
> > > MAC addresses, though, these events would imply different behavior,
> > > right? So it seems like vmgenid might need an additional field for this
> > > use case. Relatedly, VMware has that prompt where you select about your
> > > VM whether, "I moved it" or "I copied it." Presumably something like
> > > that would play a part in what is decided as part of this hypothetical
> > > second field.
> >
> > networkd doesn't change MAC addresses in the middle of everything, but
> > only when a network interface is downed and upped again. This for
> > example happens when a link beat goes away and comes back. In the
> > rewind-2min case i'd assume the link beat would probably be restored
> > to what it was 2min ago (and thus stay online), but in the clone case
> > it would probably drop momentarily and be restored than, to tell
> > software to reacquire dhcp and so on.
>
> That sounds like it's going to be sort of confusing. Let's say we've got
> some VM scenario in which rewinds are common due to whatever weird
> process (such as a build machine that wants to start out at the same
> place each time). During its course of execution, it reboots, or maybe
> there's some network connectivity issue and the link goes down. In that
> case, when the link comes up, it's going to have a different MAC
> address? That doesn't make much sense to me, but maybe I'm missing some
> bigger picture detail.
It's still better than sticking to the same MAC address for all clones
in all cases...
Dunno, in systemd the MAC address policies are configurable, for a
reason. We'll never find a default that really makes everyone
happy. Some people prefer the anonmymity of randomized MAC addresses,
others like the stability promises of hashed MAC addresses. We support
both policies. I think it would make sense to add a policy that says
"stable MAC until the first clone, then random" for example. In fact
I think it's a choice that has the chance of being a better default
than the current "always stable" approach we employ. So at the very
least we should have the option to come up with a policy taking vm
generations into account, it's a separate discussion to decide whether
to make it opt-in or the default then, and I doubt that part of the
discussion really matters here...
Lennart
--
Lennart Poettering, Berlin
Hi Simo,
On Tue, May 10, 2022 at 08:40:48PM -0400, Simo Sorce wrote:
> At your request teleporting here the answer I gave on a different
> thread, reinforced by some thinking.
>
> As a user space crypto library person I think the only reasonable
> interface is something like a vDSO.
>
> Poll() interfaces are nice and all for system programs that have full
> control of their event loop and do not have to react immediately to
> this event, however crypto libraries do not have the luxury of
> controlling the main loop of the application.
>
> Additionally crypto libraries really need to ensure the value they
> return from their PRNG is fine, which means they do not return a value
> if the vmgenid has changed before they can reseed, or there could be
> catastrophic duplication of "random" values used in IVs or ECDSA
> Signatures or ids/cookies or whatever.
>
> For crypto libraries it is much simpler to poll for this information
> than using notifications of any kind given libraries are
> generally not in full control of what the process does.
>
> This needs to be polled fast as well, because the whole point of
> initializing a PRNG in the library is that asking /dev/urandom all the
> time is too slow (due to context switches and syscall overhead), so
> anything that would require a context switch in order to pull data from
> the PRNG would not really fly.
>
> A vDSO or similar would allow to pull the vmgenid or whatever epoch
> value in before generating the random numbers and then barrier-style
> check that the value is still unchanged before returning the random
> data to the caller. This will reduce the race condition (which simply
> cannot be completely avoided) to a very unlikely event.
It sounds like your library issue is somewhat similar to what Alex was
talking about with regards to having a hard time using poll in s2n. I'm
still waiting to hear if Amazon figured out some way that this is
possible (with, e.g., a thread). But anyway, it seems like this is
something library authors might hit.
My proposal here is made with nonce reuse in mind, for things like
session keys that use sequential nonces.
A different issue is random nonces. For these, it seems like a call to
getrandom() for each nonce is probably the best bet. But it sounds like
you're interested in a userspace RNG, akin to OpenBSD's arc4random(3). I
hope you saw these threads:
- https://lore.kernel.org/lkml/[email protected]/
- https://lore.kernel.org/lkml/[email protected]/
- https://lore.kernel.org/lkml/CAHmME9qHGSF8w3DoyCP+ud_N0MAJ5_8zsUWx=rxQB1mFnGcu9w@mail.gmail.com/
Each one of those touches on vDSO things quite a bit. Basically, the
motivation for doing that is for making userspace RNGs safe and
promoting their use with a variety of kernel enhancements to make that
easy. And IF we are to ship a vDSO RNG, then certainly this vmgenid
business should be exposed that way, over and above other mechanisms.
It'd make the most sense...IF we're going to ship a vDSO RNG.
So the question really is: should we ship a vDSO RNG? I could work on
designing that right. But I'm a little bit skeptical generally of the
whole userspace RNG concept. By and large they always turn out to be
less safe and more complex than the kernel one. So if we're to go that
way, I'd like to understand what the strongest arguments for it are.
Jason
Hi Eric,
On 5/12/22, Eric W. Biederman <[email protected]> wrote:
> Luis Chamberlain <[email protected]> writes:
>
>> On Tue, May 03, 2022 at 01:27:44PM +0200, Jason A. Donenfeld wrote:
>>> On Mon, May 02, 2022 at 05:43:21PM +0200, Lennart Poettering wrote:
>>> > On Mo, 02.05.22 17:30, Jason A. Donenfeld ([email protected]) wrote:
>>> >
>>> > > Just wanted to double check with you that this change wouldn't break
>>> > > how
>>> > > you're using it in systemd for /proc/sys/kernel/hostname:
>>> > >
>>> > > https://github.com/systemd/systemd/blob/39cd62c30c2e6bb5ec13ebc1ecf0d37ed015b1b8/src/journal/journald-server.c#L1832
>>> > > https://github.com/systemd/systemd/blob/39cd62c30c2e6bb5ec13ebc1ecf0d37ed015b1b8/src/resolve/resolved-manager.c#L465
>>> > >
>>> > > I couldn't find anybody else actually polling on it. Interestingly,
>>> > > it
>>> > > looks like sd_event_add_io uses epoll() inside, but you're not
>>> > > hitting
>>> > > the bug that Jann pointed out (because I suppose you're not poll()ing
>>> > > on
>>> > > an epoll fd).
>>> >
>>> > Well, if you made sure this still works, I am fine either way ;-)
>>>
>>> Actually... ugh. It doesn't work. systemd uses uname() to read the host
>>> name, and doesn't actually read() the file descriptor after receiving
>>> the poll event on it. So I guess I'll forget this, and maybe we'll have
>>> to live with sysctl's poll() being broken. :(
>
> We should be able to modify calling uname() to act the same as reading
> the file descriptor.
How? That sounds like madness. read() takes a fd. uname() doesn't. Are
you proposing we walk through the fds of the process calling uname()
til we find a matching one and then twiddle it's private context
state? I mean I guess that'd work, but...
Jason
On Mon, May 02, 2022 at 04:06:01PM +0200, Jason A. Donenfeld wrote:
>Events that poll() responds to are supposed to be consumed when the file
>is read(), not by the poll() itself. By putting it on the poll() itself,
>it makes it impossible to poll() on a epoll file descriptor, since the
>event gets consumed too early. Jann wrote a PoC, available in the link
>below.
>
>Reported-by: Jann Horn <[email protected]>
>Cc: Kees Cook <[email protected]>
>Cc: Luis Chamberlain <[email protected]>
>Cc: [email protected]
>Link: https://lore.kernel.org/lkml/CAG48ez1F0P7Wnp=PGhiUej=u=8CSF6gpD9J=Oxxg0buFRqV1tA@mail.gmail.com/
>Signed-off-by: Jason A. Donenfeld <[email protected]>
It seems to be my bug. This is indeed better. Also, I don't think it's unsafe
to fix it like this neither. If my memory serves (it's what, 10+ years?), this
was only tested and used with poll(), which will continue to work.
There were plans to use it in one of systemd's tools, in which case we'd
probably notice the misbehavior with epoll().... humn, checking now systemd's
codebase:
static int on_hostname_change(sd_event_source *es, int fd, uint32_t revents, void *userdata) {
...
log_info("System hostname changed to '%s'.", full_hostname);
...
}
static int manager_watch_hostname(Manager *m) {
int r;
assert(m);
m->hostname_fd = open("/proc/sys/kernel/hostname",
O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
if (m->hostname_fd < 0) {
log_warning_errno(errno, "Failed to watch hostname: %m");
return 0;
}
r = sd_event_add_io(m->event, &m->hostname_event_source, m->hostname_fd, 0, on_hostname_change, m);
if (r < 0) {
if (r == -EPERM)
/* kernels prior to 3.2 don't support polling this file. Ignore the failure. */
m->hostname_fd = safe_close(m->hostname_fd);
else
return log_error_errno(r, "Failed to add hostname event source: %m");
}
....
}
and sd_event library uses epoll. So, it's apparently not working and it doesn't
seem to be their intention to rely on the misbehavior. This makes me think it
even deserves a Cc to stable.
Reviewed-by: Lucas De Marchi <[email protected]>
Lucas De Marchi
>---
> fs/proc/proc_sysctl.c | 12 +++++++++---
> 1 file changed, 9 insertions(+), 3 deletions(-)
>
>diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
>index 7d9cfc730bd4..1aa145794207 100644
>--- a/fs/proc/proc_sysctl.c
>+++ b/fs/proc/proc_sysctl.c
>@@ -622,6 +622,14 @@ static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter,
>
> static ssize_t proc_sys_read(struct kiocb *iocb, struct iov_iter *iter)
> {
>+ struct inode *inode = file_inode(iocb->ki_filp);
>+ struct ctl_table_header *head = grab_header(inode);
>+ struct ctl_table *table = PROC_I(inode)->sysctl_entry;
>+
>+ if (!IS_ERR(head) && table->poll)
>+ iocb->ki_filp->private_data = proc_sys_poll_event(table->poll);
>+ sysctl_head_finish(head);
>+
> return proc_sys_call_handler(iocb, iter, 0);
> }
>
>@@ -668,10 +676,8 @@ static __poll_t proc_sys_poll(struct file *filp, poll_table *wait)
> event = (unsigned long)filp->private_data;
> poll_wait(filp, &table->poll->wait, wait);
>
>- if (event != atomic_read(&table->poll->event)) {
>- filp->private_data = proc_sys_poll_event(table->poll);
>+ if (event != atomic_read(&table->poll->event))
> ret = EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
>- }
>
> out:
> sysctl_head_finish(head);
>--
>2.35.1
>