posix_timer_add() tries to allocate a posix timer ID by starting from the
cached ID which was stored by the last successful allocation.
This is done in a loop searching the ID space for a free slot one by
one. The loop has to terminate when the search wrapped around to the
starting point.
But that's racy vs. establishing the starting point. That is read out
lockless, which leads to the following problem:
CPU0 CPU1
posix_timer_add()
start = sig->posix_timer_id;
lock(hash_lock);
... posix_timer_add()
if (++sig->posix_timer_id < 0)
start = sig->posix_timer_id;
sig->posix_timer_id = 0;
So CPU1 can observe a negative start value, i.e. -1, and the loop break
never happens because the condition can never be true:
if (sig->posix_timer_id == start)
break;
While this is unlikely to ever turn into an endless loop as the ID space is
huge (INT_MAX), the racy read of the start value caught the attention of
KCSAN and Dmitry unearthed that incorrectness.
Rewrite it so that the start condition can never observe the negative value
and annotate the read and the write with READ_ONCE()/WRITE_ONCE().
Reported-by: [email protected]
Reported-by: Dmitry Vyukov <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
---
include/linux/sched/signal.h | 2 +-
kernel/time/posix-timers.c | 30 +++++++++++++++++-------------
2 files changed, 18 insertions(+), 14 deletions(-)
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -135,7 +135,7 @@ struct signal_struct {
#ifdef CONFIG_POSIX_TIMERS
/* POSIX.1b Interval Timers */
- int posix_timer_id;
+ unsigned int next_posix_timer_id;
struct list_head posix_timers;
/* ITIMER_REAL timer for the process */
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -140,25 +140,29 @@ static struct k_itimer *posix_timer_by_i
static int posix_timer_add(struct k_itimer *timer)
{
struct signal_struct *sig = current->signal;
- int first_free_id = sig->posix_timer_id;
struct hlist_head *head;
- int ret = -ENOENT;
+ unsigned int start, id;
- do {
+ /* Can be written by a different task concurrently in the loop below */
+ start = READ_ONCE(sig->next_posix_timer_id);
+
+ for (id = ~start; start != id; id++) {
spin_lock(&hash_lock);
- head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
- if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
+ id = sig->next_posix_timer_id;
+
+ /* Write the next ID back. Clamp it to the positive space */
+ WRITE_ONCE(sig->next_posix_timer_id, (id + 1) & INT_MAX);
+
+ head = &posix_timers_hashtable[hash(sig, id)];
+ if (!__posix_timers_find(head, sig, id)) {
hlist_add_head_rcu(&timer->t_hash, head);
- ret = sig->posix_timer_id;
+ spin_unlock(&hash_lock);
+ return id;
}
- if (++sig->posix_timer_id < 0)
- sig->posix_timer_id = 0;
- if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT))
- /* Loop over all possible ids completed */
- ret = -EAGAIN;
spin_unlock(&hash_lock);
- } while (ret == -ENOENT);
- return ret;
+ }
+ /* POSIX return code when no timer ID could be allocated */
+ return -EAGAIN;
}
static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
On Tue, Apr 25, 2023 at 08:48:58PM +0200, Thomas Gleixner wrote:
> posix_timer_add() tries to allocate a posix timer ID by starting from the
> cached ID which was stored by the last successful allocation.
>
> This is done in a loop searching the ID space for a free slot one by
> one. The loop has to terminate when the search wrapped around to the
> starting point.
>
> But that's racy vs. establishing the starting point. That is read out
> lockless, which leads to the following problem:
>
> CPU0 CPU1
> posix_timer_add()
> start = sig->posix_timer_id;
> lock(hash_lock);
> ... posix_timer_add()
> if (++sig->posix_timer_id < 0)
> start = sig->posix_timer_id;
> sig->posix_timer_id = 0;
>
> So CPU1 can observe a negative start value, i.e. -1, and the loop break
> never happens because the condition can never be true:
>
> if (sig->posix_timer_id == start)
> break;
>
> While this is unlikely to ever turn into an endless loop as the ID space is
> huge (INT_MAX), the racy read of the start value caught the attention of
> KCSAN and Dmitry unearthed that incorrectness.
>
> Rewrite it so that the start condition can never observe the negative value
> and annotate the read and the write with READ_ONCE()/WRITE_ONCE().
>
> Reported-by: [email protected]
> Reported-by: Dmitry Vyukov <[email protected]>
> Signed-off-by: Thomas Gleixner <[email protected]>
> ---
> include/linux/sched/signal.h | 2 +-
> kernel/time/posix-timers.c | 30 +++++++++++++++++-------------
> 2 files changed, 18 insertions(+), 14 deletions(-)
>
> --- a/include/linux/sched/signal.h
> +++ b/include/linux/sched/signal.h
> @@ -135,7 +135,7 @@ struct signal_struct {
> #ifdef CONFIG_POSIX_TIMERS
>
> /* POSIX.1b Interval Timers */
> - int posix_timer_id;
> + unsigned int next_posix_timer_id;
> struct list_head posix_timers;
>
> /* ITIMER_REAL timer for the process */
> --- a/kernel/time/posix-timers.c
> +++ b/kernel/time/posix-timers.c
> @@ -140,25 +140,29 @@ static struct k_itimer *posix_timer_by_i
> static int posix_timer_add(struct k_itimer *timer)
> {
> struct signal_struct *sig = current->signal;
> - int first_free_id = sig->posix_timer_id;
> struct hlist_head *head;
> - int ret = -ENOENT;
> + unsigned int start, id;
>
> - do {
> + /* Can be written by a different task concurrently in the loop below */
> + start = READ_ONCE(sig->next_posix_timer_id);
> +
> + for (id = ~start; start != id; id++) {
> spin_lock(&hash_lock);
> - head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
> - if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
> + id = sig->next_posix_timer_id;
> +
> + /* Write the next ID back. Clamp it to the positive space */
> + WRITE_ONCE(sig->next_posix_timer_id, (id + 1) & INT_MAX);
Isn't that looping forever?
Thanks.
On Sat, May 06 2023 at 00:58, Thomas Gleixner wrote:
> On Fri, May 05 2023 at 16:50, Frederic Weisbecker wrote:
> So the whole thing works like this:
>
> start = READ_LOCKLESS(sig->next_id);
>
> // Enfore that id and start are different to not terminate right away
> id = ~start;
>
> loop:
> if (id == start)
> goto fail;
> lock()
> id = sig->next_id; <-- stable readout
> sig->next_id = (id + 1) & INT_MAX; <-- prevent going negative
>
> if (unused_id(id)) {
> add_timer_to_hash(timer, id);
> unlock();
> return id;
> }
> id++;
> unlock();
> goto loop;
>
> As the initial lockless readout is guaranteed to be in the positive
> space, how is that supposed to be looping forever?
Unless you think about the theoretical case of an unlimited number of
threads sharing the signal_struct which all concurrently try to allocate
a timer id and then releasing it immediately again (to avoid resource
limit exhaustion). Theoretically possible, but is this a real concern
with a timer ID space of 2G?
I'm sure that it's incredibly hard to exploit this, but what's really
bothering me is the hash table itself. The only reason why we have that
is CRIU.
The only alternative solution I could come up with is a paritioned
xarray where the index space would be segmented for each TGID, i.e.
segment.start = TGID * MAX_TIMERS_PER_PROCESS
segment.end = segment.start + MAX_TIMERS_PER_PROCESS - 1
where MAX_TIMERS_PER_PROCESS could be a copius 2^16 which would work for
both 32bit and 64bit TID limits.
That would avoid the hash table lookups and the related issues, but OTH
it would require to allocate one extra page per TGID if the application
uses a single posix timer.
Not sure whether that's worth it though.
Thanks,
tglx
On Fri, May 05 2023 at 16:50, Frederic Weisbecker wrote:
> On Tue, Apr 25, 2023 at 08:48:58PM +0200, Thomas Gleixner wrote:
>>
>> - do {
>> + /* Can be written by a different task concurrently in the loop below */
>> + start = READ_ONCE(sig->next_posix_timer_id);
>> +
>> + for (id = ~start; start != id; id++) {
>> spin_lock(&hash_lock);
>> - head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
>> - if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
>> + id = sig->next_posix_timer_id;
>> +
>> + /* Write the next ID back. Clamp it to the positive space */
>> + WRITE_ONCE(sig->next_posix_timer_id, (id + 1) & INT_MAX);
>
> Isn't that looping forever?
No. The loop breaks when @id reaches the locklessly read out @start
value again.
I admit that the 'id = ~start' part in the for() expression is confusing
without a comment. That initial @id value is in the invalid space to
make sure that the termination condition 'start != id' does not trigger
right away. But that value gets immediately overwritten after acquiring
hash_lock by the real sig->next_posix_timer_id value.
The clamp to the positive space has nothing to do with that. That's
required because the ID must be positive as a negative value would be an
error when returned, right?
So the whole thing works like this:
start = READ_LOCKLESS(sig->next_id);
// Enfore that id and start are different to not terminate right away
id = ~start;
loop:
if (id == start)
goto fail;
lock()
id = sig->next_id; <-- stable readout
sig->next_id = (id + 1) & INT_MAX; <-- prevent going negative
if (unused_id(id)) {
add_timer_to_hash(timer, id);
unlock();
return id;
}
id++;
unlock();
goto loop;
As the initial lockless readout is guaranteed to be in the positive
space, how is that supposed to be looping forever?
Admittedly this can be written less obscure, but not tonight :)
Thanks,
tglx
On Sat, May 06 2023 at 01:36, Thomas Gleixner wrote:
> On Sat, May 06 2023 at 00:58, Thomas Gleixner wrote:
>> On Fri, May 05 2023 at 16:50, Frederic Weisbecker wrote:
>> As the initial lockless readout is guaranteed to be in the positive
>> space, how is that supposed to be looping forever?
>
> Unless you think about the theoretical case of an unlimited number of
> threads sharing the signal_struct which all concurrently try to allocate
> a timer id and then releasing it immediately again (to avoid resource
> limit exhaustion). Theoretically possible, but is this a real concern
> with a timer ID space of 2G?
It only hurts the process which does this and does not inflict any
latencies by holding hash_lock over the full search for a free spot.
> The only alternative solution I could come up with is a paritioned
> xarray where the index space would be segmented for each TGID, i.e.
>
> segment.start = TGID * MAX_TIMERS_PER_PROCESS
> segment.end = segment.start + MAX_TIMERS_PER_PROCESS - 1
>
> where MAX_TIMERS_PER_PROCESS could be a copius 2^16 which would work for
> both 32bit and 64bit TID limits.
>
> That would avoid the hash table lookups and the related issues, but OTH
> it would require to allocate one extra page per TGID if the application
> uses a single posix timer.
>
> Not sure whether that's worth it though.
More thoughts on this. If we go there and accept the extra page of
memory then we can just go all the way and make the xarray per process,
actually per signal.
That would also require another change, namely making the preallocated
sigqueue part of struct k_itimer, which in turn would not be the worst
of all ideas as it gets rid of the lookup in posixtimer_rearm() and
would also allow for some clever handling of the nasty SIG_IGN issues.
Though that's separate from the problem at hand.
Thanks,
tglx
On Mon, May 08 2023 at 23:57, Thomas Gleixner wrote:
> On Sat, May 06 2023 at 01:36, Thomas Gleixner wrote:
>> The only alternative solution I could come up with is a paritioned
>> xarray where the index space would be segmented for each TGID, i.e.
>>
>> segment.start = TGID * MAX_TIMERS_PER_PROCESS
>> segment.end = segment.start + MAX_TIMERS_PER_PROCESS - 1
>>
>> where MAX_TIMERS_PER_PROCESS could be a copius 2^16 which would work for
>> both 32bit and 64bit TID limits.
>>
>> That would avoid the hash table lookups and the related issues, but OTH
>> it would require to allocate one extra page per TGID if the application
>> uses a single posix timer.
>>
>> Not sure whether that's worth it though.
>
> More thoughts on this. If we go there and accept the extra page of
> memory then we can just go all the way and make the xarray per process,
> actually per signal.
Thinking more about it. The current scheme how timer ID allocation works
is really interesting vs. CRIU.
Assume a process creates/deletes timers frequently. It's not hard to
move the next ID close to INT_MAX, i.e. 2G
Now checkpoint that thing and restore it which means to do the
create/delete dance to move next ID up to the last one-1. Will only take
a couple of hours....
Thanks,
tglx
On Sat, May 06, 2023 at 01:36:22AM +0200, Thomas Gleixner wrote:
> On Sat, May 06 2023 at 00:58, Thomas Gleixner wrote:
> > On Fri, May 05 2023 at 16:50, Frederic Weisbecker wrote:
> > So the whole thing works like this:
> >
> > start = READ_LOCKLESS(sig->next_id);
> >
> > // Enfore that id and start are different to not terminate right away
> > id = ~start;
> >
> > loop:
> > if (id == start)
> > goto fail;
> > lock()
> > id = sig->next_id; <-- stable readout
> > sig->next_id = (id + 1) & INT_MAX; <-- prevent going negative
> >
> > if (unused_id(id)) {
> > add_timer_to_hash(timer, id);
> > unlock();
> > return id;
> > }
> > id++;
> > unlock();
> > goto loop;
> >
> > As the initial lockless readout is guaranteed to be in the positive
> > space, how is that supposed to be looping forever?
>
> Unless you think about the theoretical case of an unlimited number of
> threads sharing the signal_struct which all concurrently try to allocate
> a timer id and then releasing it immediately again (to avoid resource
> limit exhaustion). Theoretically possible, but is this a real concern
> with a timer ID space of 2G?
I didn't go that far actually, it was just me misunderstanding that loop and
especially the (id =~start) part. Now I got it.
I guess the for statement can just be:
for (; start != id; id++)
>
> I'm sure that it's incredibly hard to exploit this, but what's really
> bothering me is the hash table itself. The only reason why we have that
> is CRIU.
>
> The only alternative solution I could come up with is a paritioned
> xarray where the index space would be segmented for each TGID, i.e.
>
> segment.start = TGID * MAX_TIMERS_PER_PROCESS
> segment.end = segment.start + MAX_TIMERS_PER_PROCESS - 1
>
> where MAX_TIMERS_PER_PROCESS could be a copius 2^16 which would work for
> both 32bit and 64bit TID limits.
>
> That would avoid the hash table lookups and the related issues, but OTH
> it would require to allocate one extra page per TGID if the application
> uses a single posix timer.
>
> Not sure whether that's worth it though.
Not sure either...
Thanks.
On Tue, May 09 2023 at 11:42, Frederic Weisbecker wrote:
> On Sat, May 06, 2023 at 01:36:22AM +0200, Thomas Gleixner wrote:
>> That would avoid the hash table lookups and the related issues, but OTH
>> it would require to allocate one extra page per TGID if the application
>> uses a single posix timer.
>>
>> Not sure whether that's worth it though.
>
> Not sure either...
See my other reply on that...
On Tue, May 09 2023 at 11:42, Frederic Weisbecker wrote:
> On Sat, May 06, 2023 at 01:36:22AM +0200, Thomas Gleixner wrote:
>> Unless you think about the theoretical case of an unlimited number of
>> threads sharing the signal_struct which all concurrently try to allocate
>> a timer id and then releasing it immediately again (to avoid resource
>> limit exhaustion). Theoretically possible, but is this a real concern
>> with a timer ID space of 2G?
>
> I didn't go that far actually, it was just me misunderstanding that loop and
> especially the (id =~start) part. Now I got it.
>
> I guess the for statement can just be:
>
> for (; start != id; id++)
My brain based compiler complains about uninitialized usage of @id. I'm
pretty sure it's rightfully complaining and a real compiler would agree,
no?
Thanks,
tglx
On Tue, May 09 2023 at 11:30, Thomas Gleixner wrote:
> On Mon, May 08 2023 at 23:57, Thomas Gleixner wrote:
>> More thoughts on this. If we go there and accept the extra page of
>> memory then we can just go all the way and make the xarray per process,
>> actually per signal.
>
> Thinking more about it. The current scheme how timer ID allocation works
> is really interesting vs. CRIU.
>
> Assume a process creates/deletes timers frequently. It's not hard to
> move the next ID close to INT_MAX, i.e. 2G
>
> Now checkpoint that thing and restore it which means to do the
> create/delete dance to move next ID up to the last one-1. Will only take
> a couple of hours....
I'm cursing myself for overlooking this back then when the CRIU changes
to the timer ID management were made. Why?
Because that created an ABI which CRIU relies on.
The proper solution for this would be to make it possible to create a
timer with a given ID. That's not rocket science, but we need buy in
from the CRIU folks. Otherwise we are up the regression creek without a
paddle.
Thanks,
tglx
On Tue, May 09, 2023 at 02:38:50PM +0200, Thomas Gleixner wrote:
> On Tue, May 09 2023 at 11:42, Frederic Weisbecker wrote:
> > On Sat, May 06, 2023 at 01:36:22AM +0200, Thomas Gleixner wrote:
> >> Unless you think about the theoretical case of an unlimited number of
> >> threads sharing the signal_struct which all concurrently try to allocate
> >> a timer id and then releasing it immediately again (to avoid resource
> >> limit exhaustion). Theoretically possible, but is this a real concern
> >> with a timer ID space of 2G?
> >
> > I didn't go that far actually, it was just me misunderstanding that loop and
> > especially the (id =~start) part. Now I got it.
> >
> > I guess the for statement can just be:
> >
> > for (; start != id; id++)
>
> My brain based compiler complains about uninitialized usage of @id. I'm
> pretty sure it's rightfully complaining and a real compiler would agree,
> no?
*sigh* I should think more before pressing answer these days :)
Hi!
This is a summary of several mails so that the CRIU people have the full
picture.
A recent syzbot report made me look at the timer ID management, which
was modified 7 years ago to accomodate CRIU:
5ed67f05f66c ("posix timers: Allocate timer id per process (v2)")
and introduced that reported issue along with a bogus loop termination
problem. See
https://lore.kernel.org/lkml/[email protected]/
https://lore.kernel.org/lkml/[email protected]
for details.
The intent to make the timer IDs per process is definitely correct, but
the implementation is beyond suboptimal. I really regret that I did not
catch this back then when picking those changes up.
The way it works is that each process keeps a 'next_timer_id' which it
uses to allocate the next timer. That allows CRIU to reconstruct timers
with the same ID by walking the ID space via
do {
timer_create(&timer, ...., &id);
if (id == original_id)
goto success;
timer_delete(&timer);
} while (idspace_not_exhausted());
That works by some definition of works, but it is problematic in two ways:
1) As the timer ID space is up to INT_MAX, a process which creates and
deletes timers frequently, can easily move up close to the INT_MAX
id space over time.
If such a process is checkpointed and restored, then the above loop
will run for at least an hour to restore a single timer.
And no, this is not only a hypothetical issue. There are legitimate
scenarios where threads are created and the control thread arms a
posix CPU timer on them. Such threads can be torn down on a regular
base due to thread pool consolidations. As CRIU does not happen
every 5 minutes it's not completely unlikely that such a process
surives quite some time on a particular host and thereby approaches
the ID space limit.
Sure we can restrict the ID space to a way smaller number so the
search wraps around earlier, but what's a sensible limit?
Though restricting the ID space has its own issue vs. backwards
compability. A process which created a timer on an older kernel with
an ID larger than the newer kernels ID limit cannot longer be
restored on that newer kernel.
Aside of that it does not solve the other problem this created:
2) That change created an user space ABI, which means that the kernel
side has to stick with this next ID search mechanism forever.
That prevents to get rid of that global lock and hash table by
sticking an xarray into task::signal which makes a lot of sense in
terms of cache locality and gets rid of the extra timer list
management in task::signal. Making this change would be very useful
to address some other issues in the posix-timer code without
creating yet more duct tape horrors.
Such a change obviously has to aim for a dense ID space to keep the
memory overhead low, but that breaks existing CRIU userspace because
dense ID space and next ID search does not fit together.
Next ID search is obviously creating non-recoverable holes in the
case that timers are deleted afterwards.
A dense ID space approach can create holes too, but they are
recoverable and well within the resource limits, because the process
has to be able to create enough timers in the first place in order
to release those in the middle.
With the next ID search brute force recovery is not possible on a
kernel with dense ID as there is no way to create all intermediate
timers first before reaching the one at the far end due to resource
limits.
So because of that half thought out user space ABI we are now up the
regression creek without a paddle, unless CRIU can accomodate to a
different restore mechanism to lift this restriction from the kernel.
Thoughts?
Thanks,
tglx
On Tue, May 9, 2023 at 5:50 AM Thomas Gleixner <[email protected]> wrote:
>
> On Tue, May 09 2023 at 11:30, Thomas Gleixner wrote:
> > On Mon, May 08 2023 at 23:57, Thomas Gleixner wrote:
> >> More thoughts on this. If we go there and accept the extra page of
> >> memory then we can just go all the way and make the xarray per process,
> >> actually per signal.
> >
> > Thinking more about it. The current scheme how timer ID allocation works
> > is really interesting vs. CRIU.
> >
> > Assume a process creates/deletes timers frequently. It's not hard to
> > move the next ID close to INT_MAX, i.e. 2G
> >
> > Now checkpoint that thing and restore it which means to do the
> > create/delete dance to move next ID up to the last one-1. Will only take
> > a couple of hours....
>
> I'm cursing myself for overlooking this back then when the CRIU changes
> to the timer ID management were made. Why?
>
> Because that created an ABI which CRIU relies on.
>
> The proper solution for this would be to make it possible to create a
> timer with a given ID. That's not rocket science, but we need buy in
> from the CRIU folks. Otherwise we are up the regression creek without a
> paddle.
Let's go with the proper solution. I will prepare CRIU changes when
kernel patches are ready.
Thanks,
Andrei
On 10.05.2023 05:42, Thomas Gleixner wrote:
> Hi!
>
> This is a summary of several mails so that the CRIU people have the full
> picture.
>
> A recent syzbot report made me look at the timer ID management, which
> was modified 7 years ago to accomodate CRIU:
>
> 5ed67f05f66c ("posix timers: Allocate timer id per process (v2)")
>
> and introduced that reported issue along with a bogus loop termination
> problem. See
>
> https://lore.kernel.org/lkml/[email protected]/
> https://lore.kernel.org/lkml/[email protected]
>
> for details.
>
> The intent to make the timer IDs per process is definitely correct, but
> the implementation is beyond suboptimal. I really regret that I did not
> catch this back then when picking those changes up.
>
> The way it works is that each process keeps a 'next_timer_id' which it
> uses to allocate the next timer. That allows CRIU to reconstruct timers
> with the same ID by walking the ID space via
>
> do {
> timer_create(&timer, ...., &id);
> if (id == original_id)
> goto success;
> timer_delete(&timer);
> } while (idspace_not_exhausted());
>
> That works by some definition of works, but it is problematic in two ways:
>
> 1) As the timer ID space is up to INT_MAX, a process which creates and
> deletes timers frequently, can easily move up close to the INT_MAX
> id space over time.
>
> If such a process is checkpointed and restored, then the above loop
> will run for at least an hour to restore a single timer.
>
> And no, this is not only a hypothetical issue. There are legitimate
> scenarios where threads are created and the control thread arms a
> posix CPU timer on them. Such threads can be torn down on a regular
> base due to thread pool consolidations. As CRIU does not happen
> every 5 minutes it's not completely unlikely that such a process
> surives quite some time on a particular host and thereby approaches
> the ID space limit.
>
> Sure we can restrict the ID space to a way smaller number so the
> search wraps around earlier, but what's a sensible limit?
>
> Though restricting the ID space has its own issue vs. backwards
> compability. A process which created a timer on an older kernel with
> an ID larger than the newer kernels ID limit cannot longer be
> restored on that newer kernel.
>
> Aside of that it does not solve the other problem this created:
>
> 2) That change created an user space ABI, which means that the kernel
> side has to stick with this next ID search mechanism forever.
>
> That prevents to get rid of that global lock and hash table by
> sticking an xarray into task::signal which makes a lot of sense in
> terms of cache locality and gets rid of the extra timer list
> management in task::signal. Making this change would be very useful
> to address some other issues in the posix-timer code without
> creating yet more duct tape horrors.
>
> Such a change obviously has to aim for a dense ID space to keep the
> memory overhead low, but that breaks existing CRIU userspace because
> dense ID space and next ID search does not fit together.
>
> Next ID search is obviously creating non-recoverable holes in the
> case that timers are deleted afterwards.
>
> A dense ID space approach can create holes too, but they are
> recoverable and well within the resource limits, because the process
> has to be able to create enough timers in the first place in order
> to release those in the middle.
>
> With the next ID search brute force recovery is not possible on a
> kernel with dense ID as there is no way to create all intermediate
> timers first before reaching the one at the far end due to resource
> limits.
>
> So because of that half thought out user space ABI we are now up the
> regression creek without a paddle, unless CRIU can accomodate to a
> different restore mechanism to lift this restriction from the kernel.
>
> Thoughts?
Maybe we can do something similar to /proc/sys/kernel/ns_last_pid?
Switch to per-(process->signal) idr based approach with idr_set_cursor
to set next id for next posix timer from new sysctl?
>
> Thanks,
>
> tglx
>
>
--
Best regards, Tikhomirov Pavel
Senior Software Developer, Virtuozzo.
On Tue, May 9, 2023 at 2:42 PM Thomas Gleixner <[email protected]> wrote:
>
> Hi!
>
> This is a summary of several mails so that the CRIU people have the full
> picture.
>
> A recent syzbot report made me look at the timer ID management, which
> was modified 7 years ago to accomodate CRIU:
>
> 5ed67f05f66c ("posix timers: Allocate timer id per process (v2)")
>
> and introduced that reported issue along with a bogus loop termination
> problem. See
>
> https://lore.kernel.org/lkml/[email protected]/
> https://lore.kernel.org/lkml/[email protected]
>
> for details.
>
> The intent to make the timer IDs per process is definitely correct, but
> the implementation is beyond suboptimal. I really regret that I did not
> catch this back then when picking those changes up.
>
> The way it works is that each process keeps a 'next_timer_id' which it
> uses to allocate the next timer. That allows CRIU to reconstruct timers
> with the same ID by walking the ID space via
>
> do {
> timer_create(&timer, ...., &id);
> if (id == original_id)
> goto success;
> timer_delete(&timer);
> } while (idspace_not_exhausted());
>
> That works by some definition of works, but it is problematic in two ways:
>
> 1) As the timer ID space is up to INT_MAX, a process which creates and
> deletes timers frequently, can easily move up close to the INT_MAX
> id space over time.
>
> If such a process is checkpointed and restored, then the above loop
> will run for at least an hour to restore a single timer.
>
> And no, this is not only a hypothetical issue. There are legitimate
> scenarios where threads are created and the control thread arms a
> posix CPU timer on them. Such threads can be torn down on a regular
> base due to thread pool consolidations. As CRIU does not happen
> every 5 minutes it's not completely unlikely that such a process
> surives quite some time on a particular host and thereby approaches
> the ID space limit.
>
> Sure we can restrict the ID space to a way smaller number so the
> search wraps around earlier, but what's a sensible limit?
>
> Though restricting the ID space has its own issue vs. backwards
> compability. A process which created a timer on an older kernel with
> an ID larger than the newer kernels ID limit cannot longer be
> restored on that newer kernel.
>
> Aside of that it does not solve the other problem this created:
>
> 2) That change created an user space ABI, which means that the kernel
> side has to stick with this next ID search mechanism forever.
>
> That prevents to get rid of that global lock and hash table by
> sticking an xarray into task::signal which makes a lot of sense in
> terms of cache locality and gets rid of the extra timer list
> management in task::signal. Making this change would be very useful
> to address some other issues in the posix-timer code without
> creating yet more duct tape horrors.
>
> Such a change obviously has to aim for a dense ID space to keep the
> memory overhead low, but that breaks existing CRIU userspace because
> dense ID space and next ID search does not fit together.
>
> Next ID search is obviously creating non-recoverable holes in the
> case that timers are deleted afterwards.
>
> A dense ID space approach can create holes too, but they are
> recoverable and well within the resource limits, because the process
> has to be able to create enough timers in the first place in order
> to release those in the middle.
>
> With the next ID search brute force recovery is not possible on a
> kernel with dense ID as there is no way to create all intermediate
> timers first before reaching the one at the far end due to resource
> limits.
>
> So because of that half thought out user space ABI we are now up the
> regression creek without a paddle, unless CRIU can accomodate to a
> different restore mechanism to lift this restriction from the kernel.
>
> Thoughts?
Hi Thomas,
If you give us a new API to create timers with specified id-s, we will
figure out how to live with it. It isn't good to ask users to update
CRIU to work on new kernels, but here are reasons and event improvements
for CRIU, so I think it's worth it.
As for API, we can use one bit of sigevent.sigev_notify to request a
timer with a specified id.
Thanks,
Andrei
Pavel!
On Wed, May 10 2023 at 12:36, Pavel Tikhomirov wrote:
> On 10.05.2023 05:42, Thomas Gleixner wrote:
>> So because of that half thought out user space ABI we are now up the
>> regression creek without a paddle, unless CRIU can accomodate to a
>> different restore mechanism to lift this restriction from the kernel.
>>
>> Thoughts?
>
> Maybe we can do something similar to /proc/sys/kernel/ns_last_pid?
> Switch to per-(process->signal) idr based approach with idr_set_cursor
> to set next id for next posix timer from new sysctl?
I'm not a fan of such sysctls. We have already too many of them and that
particular one does not buy much.
We can simply let timer_create() or a new syscall create a timer at a
given ID.
That allows CRIU to restore any checkpointed process no matter which
kernel version it came from without doing this insane create/delete
dance.
The downside is that this allows to create stupidly sparse timer IDs
even for the non CRIU case, which increases per process kernel memory
consumption and creates slightly more overhead in the signal delivery
path. The latter is a burden on the process owning the timer and not
affecting expiry, which is a context stealing operation. The memory part
needs eventually some thoughts vs. accounting.
If the 'explicit at ID' option is not used then the ID mechanism is
optimzied for dense IDs by using the first available ID in a bottom up
search, which recovers holes created by a timer_delete() operation.
Thanks,
tglx
On 10.05.2023 16:16, Andrey Vagin wrote:
> On Tue, May 9, 2023 at 2:42 PM Thomas Gleixner <[email protected]> wrote:
>>
>> Hi!
>>
>> This is a summary of several mails so that the CRIU people have the full
>> picture.
>>
>> A recent syzbot report made me look at the timer ID management, which
>> was modified 7 years ago to accomodate CRIU:
>>
>> 5ed67f05f66c ("posix timers: Allocate timer id per process (v2)")
>>
>> and introduced that reported issue along with a bogus loop termination
>> problem. See
>>
>> https://lore.kernel.org/lkml/[email protected]/
>> https://lore.kernel.org/lkml/[email protected]
>>
>> for details.
>>
>> The intent to make the timer IDs per process is definitely correct, but
>> the implementation is beyond suboptimal. I really regret that I did not
>> catch this back then when picking those changes up.
>>
>> The way it works is that each process keeps a 'next_timer_id' which it
>> uses to allocate the next timer. That allows CRIU to reconstruct timers
>> with the same ID by walking the ID space via
>>
>> do {
>> timer_create(&timer, ...., &id);
>> if (id == original_id)
>> goto success;
>> timer_delete(&timer);
>> } while (idspace_not_exhausted());
>>
>> That works by some definition of works, but it is problematic in two ways:
>>
>> 1) As the timer ID space is up to INT_MAX, a process which creates and
>> deletes timers frequently, can easily move up close to the INT_MAX
>> id space over time.
>>
>> If such a process is checkpointed and restored, then the above loop
>> will run for at least an hour to restore a single timer.
>>
>> And no, this is not only a hypothetical issue. There are legitimate
>> scenarios where threads are created and the control thread arms a
>> posix CPU timer on them. Such threads can be torn down on a regular
>> base due to thread pool consolidations. As CRIU does not happen
>> every 5 minutes it's not completely unlikely that such a process
>> surives quite some time on a particular host and thereby approaches
>> the ID space limit.
>>
>> Sure we can restrict the ID space to a way smaller number so the
>> search wraps around earlier, but what's a sensible limit?
>>
>> Though restricting the ID space has its own issue vs. backwards
>> compability. A process which created a timer on an older kernel with
>> an ID larger than the newer kernels ID limit cannot longer be
>> restored on that newer kernel.
>>
>> Aside of that it does not solve the other problem this created:
>>
>> 2) That change created an user space ABI, which means that the kernel
>> side has to stick with this next ID search mechanism forever.
>>
>> That prevents to get rid of that global lock and hash table by
>> sticking an xarray into task::signal which makes a lot of sense in
>> terms of cache locality and gets rid of the extra timer list
>> management in task::signal. Making this change would be very useful
>> to address some other issues in the posix-timer code without
>> creating yet more duct tape horrors.
>>
>> Such a change obviously has to aim for a dense ID space to keep the
>> memory overhead low, but that breaks existing CRIU userspace because
>> dense ID space and next ID search does not fit together.
>>
>> Next ID search is obviously creating non-recoverable holes in the
>> case that timers are deleted afterwards.
>>
>> A dense ID space approach can create holes too, but they are
>> recoverable and well within the resource limits, because the process
>> has to be able to create enough timers in the first place in order
>> to release those in the middle.
>>
>> With the next ID search brute force recovery is not possible on a
>> kernel with dense ID as there is no way to create all intermediate
>> timers first before reaching the one at the far end due to resource
>> limits.
>>
>> So because of that half thought out user space ABI we are now up the
>> regression creek without a paddle, unless CRIU can accomodate to a
>> different restore mechanism to lift this restriction from the kernel.
>>
>> Thoughts?
>
> Hi Thomas,
>
> If you give us a new API to create timers with specified id-s, we will
> figure out how to live with it. It isn't good to ask users to update
> CRIU to work on new kernels, but here are reasons and event improvements
> for CRIU, so I think it's worth it.
I agree, any API to create timers with specified id-s would work for new
CRIU versions.
>
> As for API, we can use one bit of sigevent.sigev_notify to request a
> timer with a specified id.
Yes, I agree, this kind of API is a good option.
>
> Thanks,
> Andrei
--
Best regards, Tikhomirov Pavel
Senior Software Developer, Virtuozzo.
On 10.05.2023 16:30, Thomas Gleixner wrote:
> Pavel!
>
> On Wed, May 10 2023 at 12:36, Pavel Tikhomirov wrote:
>> On 10.05.2023 05:42, Thomas Gleixner wrote:
>>> So because of that half thought out user space ABI we are now up the
>>> regression creek without a paddle, unless CRIU can accomodate to a
>>> different restore mechanism to lift this restriction from the kernel.
>>>
>>> Thoughts?
>>
>> Maybe we can do something similar to /proc/sys/kernel/ns_last_pid?
>> Switch to per-(process->signal) idr based approach with idr_set_cursor
>> to set next id for next posix timer from new sysctl?
>
> I'm not a fan of such sysctls. We have already too many of them and that
> particular one does not buy much.
Sorry, it was a bad idea, what you suggest below is much better.
>
> We can simply let timer_create() or a new syscall create a timer at a
> given ID.
Yes this would work for CRIU. (note: in neighbor thread Andrei writes
about adding a bit to sigevent.sigev_notify to request a timer with a
specified id, new syscall is also a good option)
>
> That allows CRIU to restore any checkpointed process no matter which
> kernel version it came from without doing this insane create/delete
> dance.
Yes, for CRIU this kind of API change is a big improvement.
>
> The downside is that this allows to create stupidly sparse timer IDs
> even for the non CRIU case, which increases per process kernel memory
> consumption and creates slightly more overhead in the signal delivery
> path. The latter is a burden on the process owning the timer and not
> affecting expiry, which is a context stealing operation. The memory part
> needs eventually some thoughts vs. accounting.
>
> If the 'explicit at ID' option is not used then the ID mechanism is
> optimzied for dense IDs by using the first available ID in a bottom up
> search, which recovers holes created by a timer_delete() operation.
Not sure how kernel memory consumption increases with sparse timer IDs,
global hashtable (posix_timers_hashtable) is the same size anyway,
entries in hlists can be distributed differently as hash depends on id
directly but we have same number of entries. Probably I miss something,
why do we need dense IDs?
>
> Thanks,
>
> tglx
--
Best regards, Tikhomirov Pavel
Senior Software Developer, Virtuozzo.
On Wed, May 10, 2023 at 01:16:26AM -0700, Andrey Vagin wrote:
...
> Hi Thomas,
>
> If you give us a new API to create timers with specified id-s, we will
> figure out how to live with it. It isn't good to ask users to update
> CRIU to work on new kernels, but here are reasons and event improvements
> for CRIU, so I think it's worth it.
>
> As for API, we can use one bit of sigevent.sigev_notify to request a
> timer with a specified id.
Which will do the trick but would look somehow strange I think, since signals
are not some how related to timer's ID. Another option might be to use output
`created_timer_id` parameter as an input cookie.
Say we describe input as
struct {
u32 magic;
timer_t timer_id;
};
Then if magic doesn't match we use `created_timer_id` for output only, and
otherwise we read `timer_id` from input and use it. Of course there is a
chance that some unitialized memory passed with existing old programs but
i think false positive gonna be very-very low if ever. Just IMHO.
Cyrill
On Thu, May 11, 2023 at 12:12:32PM +0800, Pavel Tikhomirov wrote:
> Not sure how kernel memory consumption increases with sparse timer IDs,
> global hashtable (posix_timers_hashtable) is the same size anyway, entries
> in hlists can be distributed differently as hash depends on id directly but
> we have same number of entries. Probably I miss something, why do we need
> dense IDs?
The proposal was to remove the global hash and use a signal_struct based
xarray instead.
On Thu, May 11 2023 at 11:17, Pavel Tikhomirov wrote:
> On 10.05.2023 16:16, Andrey Vagin wrote:
>>>
>>> So because of that half thought out user space ABI we are now up the
>>> regression creek without a paddle, unless CRIU can accomodate to a
>>> different restore mechanism to lift this restriction from the kernel.
>>>
>> If you give us a new API to create timers with specified id-s, we will
>> figure out how to live with it. It isn't good to ask users to update
>> CRIU to work on new kernels, but here are reasons and event improvements
>> for CRIU, so I think it's worth it.
>
> I agree, any API to create timers with specified id-s would work for new
> CRIU versions.
The real question is whether this will cause any upheaval when a new
kernel meets a non-updated CRIU stack.
You know the UABI regression rules of the kernel...
Thanks,
tglx
On Thu, May 11 2023 at 12:12, Pavel Tikhomirov wrote:
> On 10.05.2023 16:30, Thomas Gleixner wrote:
>> The downside is that this allows to create stupidly sparse timer IDs
>> even for the non CRIU case, which increases per process kernel memory
>> consumption and creates slightly more overhead in the signal delivery
>> path. The latter is a burden on the process owning the timer and not
>> affecting expiry, which is a context stealing operation. The memory part
>> needs eventually some thoughts vs. accounting.
>>
>> If the 'explicit at ID' option is not used then the ID mechanism is
>> optimzied for dense IDs by using the first available ID in a bottom up
>> search, which recovers holes created by a timer_delete() operation.
>
> Not sure how kernel memory consumption increases with sparse timer IDs,
> global hashtable (posix_timers_hashtable) is the same size anyway,
> entries in hlists can be distributed differently as hash depends on id
> directly but we have same number of entries. Probably I miss something,
> why do we need dense IDs?
Because I want to get rid of the global hash table as I explained in my
summary mail.
Thanks,
tglx
On 11.05.2023 17:36, Thomas Gleixner wrote:
> On Thu, May 11 2023 at 11:17, Pavel Tikhomirov wrote:
>> On 10.05.2023 16:16, Andrey Vagin wrote:
>>>>
>>>> So because of that half thought out user space ABI we are now up the
>>>> regression creek without a paddle, unless CRIU can accomodate to a
>>>> different restore mechanism to lift this restriction from the kernel.
>>>>
>>> If you give us a new API to create timers with specified id-s, we will
>>> figure out how to live with it. It isn't good to ask users to update
>>> CRIU to work on new kernels, but here are reasons and event improvements
>>> for CRIU, so I think it's worth it.
>>
>> I agree, any API to create timers with specified id-s would work for new
>> CRIU versions.
>
> The real question is whether this will cause any upheaval when a new
> kernel meets a non-updated CRIU stack.
Creation of posix timer would hang forever in this loop
https://github.com/checkpoint-restore/criu/blob/33dd66c6fc93c47213aaa0447a94d97ba1fa56ba/criu/pie/restorer.c#L1185
if old criu is run on new kernel (without consecutive id allocation) AFAICS.
> You know the UABI regression rules of the kernel...
>
> Thanks,
>
> tglx
>
--
Best regards, Tikhomirov Pavel
Senior Software Developer, Virtuozzo.
From: Pavel Tikhomirov
> Sent: 10 May 2023 05:37
...
> > A dense ID space approach can create holes too, but they are
> > recoverable and well within the resource limits, because the process
> > has to be able to create enough timers in the first place in order
> > to release those in the middle.
While it doesn't help at all for creating items with fixed ids,
my 'favourite' scheme for allocating ids it to allocate a number
that will be a perfect hash onto an empty hash table slot.
The lookup check is then just array[id & mask].id == id.
A FIFO freelist can be run through the free entries and
the high bits incremented each time a slot is used.
So allocation is usually fixed cost.
If the table is full it's size can easily be doubled.
If the number of unused entries is doubled each time
the table size is doubled then the you (more or less)
guarantee that an id won't get reused any time soon
after it is freed.
This would be ok for restoring ids allocated by the same
scheme. But would need a fallback for restoring pathological
list of ids.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
On Thu, May 11 2023 at 17:52, Pavel Tikhomirov wrote:
> On 11.05.2023 17:36, Thomas Gleixner wrote:
>> On Thu, May 11 2023 at 11:17, Pavel Tikhomirov wrote:
>>> On 10.05.2023 16:16, Andrey Vagin wrote:
>>>>>
>>>>> So because of that half thought out user space ABI we are now up the
>>>>> regression creek without a paddle, unless CRIU can accomodate to a
>>>>> different restore mechanism to lift this restriction from the kernel.
>>>>>
>>>> If you give us a new API to create timers with specified id-s, we will
>>>> figure out how to live with it. It isn't good to ask users to update
>>>> CRIU to work on new kernels, but here are reasons and event improvements
>>>> for CRIU, so I think it's worth it.
>>>
>>> I agree, any API to create timers with specified id-s would work for new
>>> CRIU versions.
>>
>> The real question is whether this will cause any upheaval when a new
>> kernel meets a non-updated CRIU stack.
>
> Creation of posix timer would hang forever in this loop
> https://github.com/checkpoint-restore/criu/blob/33dd66c6fc93c47213aaa0447a94d97ba1fa56ba/criu/pie/restorer.c#L1185
> if old criu is run on new kernel (without consecutive id allocation) AFAICS.
Yes, because that "sanity" check
if ((long)next_id > args->posix_timers[i].spt.it_id)
which tries to establish whether the kernel provides timer IDs in strict
increasing order does not work for that case.
It "works" to detect the IDR case on older kernels by chance, but not
under all circumstances. Assume the following case:
Global IDR has a free slot at index 1
Restore tries to create a timer for index 2
That will also loop forever, unless some other process creates a timer
and occupies the free slot at index 1, right?
So this needs a fix anyway, which should be done so that the new kernel
case is at least properly detected.
But even then there is still the problem of "it worked before I upgraded
the kernel".
IOW, we are still up a creek without a paddle, unless you would be
willing to utilize the existing CRIU bug to distribute the 'deal with
new kernel' mechanics as a bug bounty :)
Fix for the loop termination below.
Thanks,
tglx
---
criu/pie/restorer.c | 24 +++++++++++++-----------
1 file changed, 13 insertions(+), 11 deletions(-)
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -1169,10 +1169,10 @@ static int timerfd_arm(struct task_resto
static int create_posix_timers(struct task_restore_args *args)
{
int ret, i;
- kernel_timer_t next_id;
+ kernel_timer_t next_id, timer_id;
struct sigevent sev;
- for (i = 0; i < args->posix_timers_n; i++) {
+ for (i = 0, next_id = 0; i < args->posix_timers_n; i++) {
sev.sigev_notify = args->posix_timers[i].spt.it_sigev_notify;
sev.sigev_signo = args->posix_timers[i].spt.si_signo;
#ifdef __GLIBC__
@@ -1183,25 +1183,27 @@ static int create_posix_timers(struct ta
sev.sigev_value.sival_ptr = args->posix_timers[i].spt.sival_ptr;
while (1) {
- ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &next_id);
+ ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &timer_id);
if (ret < 0) {
pr_err("Can't create posix timer - %d\n", i);
return ret;
}
- if (next_id == args->posix_timers[i].spt.it_id)
+ if (timer_id != next_id) {
+ pr_err("Can't create timers, kernel don't give them consequently\n");
+ return -1;
+ }
+
+ next_id++;
+
+ if (timer_id == args->posix_timers[i].spt.it_id)
break;
- ret = sys_timer_delete(next_id);
+ ret = sys_timer_delete(timer_id);
if (ret < 0) {
- pr_err("Can't remove temporaty posix timer 0x%x\n", next_id);
+ pr_err("Can't remove temporaty posix timer 0x%x\n", timer_id);
return ret;
}
-
- if ((long)next_id > args->posix_timers[i].spt.it_id) {
- pr_err("Can't create timers, kernel don't give them consequently\n");
- return -1;
- }
}
}
On 11.05.2023 21:42, Thomas Gleixner wrote:
> On Thu, May 11 2023 at 17:52, Pavel Tikhomirov wrote:
>> On 11.05.2023 17:36, Thomas Gleixner wrote:
>>> On Thu, May 11 2023 at 11:17, Pavel Tikhomirov wrote:
>>>> On 10.05.2023 16:16, Andrey Vagin wrote:
>>>>>>
>>>>>> So because of that half thought out user space ABI we are now up the
>>>>>> regression creek without a paddle, unless CRIU can accomodate to a
>>>>>> different restore mechanism to lift this restriction from the kernel.
>>>>>>
>>>>> If you give us a new API to create timers with specified id-s, we will
>>>>> figure out how to live with it. It isn't good to ask users to update
>>>>> CRIU to work on new kernels, but here are reasons and event improvements
>>>>> for CRIU, so I think it's worth it.
>>>>
>>>> I agree, any API to create timers with specified id-s would work for new
>>>> CRIU versions.
>>>
>>> The real question is whether this will cause any upheaval when a new
>>> kernel meets a non-updated CRIU stack.
>>
>> Creation of posix timer would hang forever in this loop
>> https://github.com/checkpoint-restore/criu/blob/33dd66c6fc93c47213aaa0447a94d97ba1fa56ba/criu/pie/restorer.c#L1185
>> if old criu is run on new kernel (without consecutive id allocation) AFAICS.
>
> Yes, because that "sanity" check
>
> if ((long)next_id > args->posix_timers[i].spt.it_id)
>
> which tries to establish whether the kernel provides timer IDs in strict
> increasing order does not work for that case.
Yes, this check is not perfect, but it does at least something: It
detects that posix timer creation missed needed id (if you start from 0
and increase by 1 each time you can not reach number > N before reaching
N).
>
> It "works" to detect the IDR case on older kernels by chance, but not
> under all circumstances. Assume the following case:
>
> Global IDR has a free slot at index 1
>
> Restore tries to create a timer for index 2
>
> That will also loop forever, unless some other process creates a timer
> and occupies the free slot at index 1, right?
Yes on old-old kernel, where there were no ids increasing on
create/delete, CRIU is broken, but CRIU does not support kernels earlier
than 3.11 (https://criu.org/Check_the_kernel#Basic) so probably we are fine.
git describe --contains 5ed67f05f66c
v3.10-rc1~171^2~9
>
> So this needs a fix anyway, which should be done so that the new kernel
> case is at least properly detected.
>
> But even then there is still the problem of "it worked before I upgraded
> the kernel".
>
> IOW, we are still up a creek without a paddle, unless you would be
> willing to utilize the existing CRIU bug to distribute the 'deal with
> new kernel' mechanics as a bug bounty :) >
> Fix for the loop termination below.
It fixes the loop for new kernel, I agree.
>
> Thanks,
>
> tglx
> ---
> criu/pie/restorer.c | 24 +++++++++++++-----------
> 1 file changed, 13 insertions(+), 11 deletions(-)
>
> --- a/criu/pie/restorer.c
> +++ b/criu/pie/restorer.c
> @@ -1169,10 +1169,10 @@ static int timerfd_arm(struct task_resto
> static int create_posix_timers(struct task_restore_args *args)
> {
> int ret, i;
> - kernel_timer_t next_id;
> + kernel_timer_t next_id, timer_id;
> struct sigevent sev;
>
> - for (i = 0; i < args->posix_timers_n; i++) {
> + for (i = 0, next_id = 0; i < args->posix_timers_n; i++) {
> sev.sigev_notify = args->posix_timers[i].spt.it_sigev_notify;
> sev.sigev_signo = args->posix_timers[i].spt.si_signo;
> #ifdef __GLIBC__
> @@ -1183,25 +1183,27 @@ static int create_posix_timers(struct ta
> sev.sigev_value.sival_ptr = args->posix_timers[i].spt.sival_ptr;
>
> while (1) {
> - ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &next_id);
> + ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &timer_id);
> if (ret < 0) {
> pr_err("Can't create posix timer - %d\n", i);
> return ret;
> }
>
> - if (next_id == args->posix_timers[i].spt.it_id)
> + if (timer_id != next_id) {
> + pr_err("Can't create timers, kernel don't give them consequently\n");
> + return -1;
> + }
> +
> + next_id++;
> +
> + if (timer_id == args->posix_timers[i].spt.it_id)
> break;
>
> - ret = sys_timer_delete(next_id);
> + ret = sys_timer_delete(timer_id);
> if (ret < 0) {
> - pr_err("Can't remove temporaty posix timer 0x%x\n", next_id);
> + pr_err("Can't remove temporaty posix timer 0x%x\n", timer_id);
> return ret;
> }
> -
> - if ((long)next_id > args->posix_timers[i].spt.it_id) {
> - pr_err("Can't create timers, kernel don't give them consequently\n");
> - return -1;
> - }
> }
> }
>
>
>
--
Best regards, Tikhomirov Pavel
Senior Software Developer, Virtuozzo.
On 11.05.2023 21:42, Thomas Gleixner wrote:
> On Thu, May 11 2023 at 17:52, Pavel Tikhomirov wrote:
>> On 11.05.2023 17:36, Thomas Gleixner wrote:
>>> On Thu, May 11 2023 at 11:17, Pavel Tikhomirov wrote:
>>>> On 10.05.2023 16:16, Andrey Vagin wrote:
>>>>>>
>>>>>> So because of that half thought out user space ABI we are now up the
>>>>>> regression creek without a paddle, unless CRIU can accomodate to a
>>>>>> different restore mechanism to lift this restriction from the kernel.
>>>>>>
>>>>> If you give us a new API to create timers with specified id-s, we will
>>>>> figure out how to live with it. It isn't good to ask users to update
>>>>> CRIU to work on new kernels, but here are reasons and event improvements
>>>>> for CRIU, so I think it's worth it.
>>>>
>>>> I agree, any API to create timers with specified id-s would work for new
>>>> CRIU versions.
>>>
>>> The real question is whether this will cause any upheaval when a new
>>> kernel meets a non-updated CRIU stack.
>>
>> Creation of posix timer would hang forever in this loop
>> https://github.com/checkpoint-restore/criu/blob/33dd66c6fc93c47213aaa0447a94d97ba1fa56ba/criu/pie/restorer.c#L1185
>> if old criu is run on new kernel (without consecutive id allocation) AFAICS.
>
> Yes, because that "sanity" check
>
> if ((long)next_id > args->posix_timers[i].spt.it_id)
>
> which tries to establish whether the kernel provides timer IDs in strict
> increasing order does not work for that case.
>
> It "works" to detect the IDR case on older kernels by chance, but not
> under all circumstances. Assume the following case:
>
> Global IDR has a free slot at index 1
>
> Restore tries to create a timer for index 2
>
> That will also loop forever, unless some other process creates a timer
> and occupies the free slot at index 1, right?
>
> So this needs a fix anyway, which should be done so that the new kernel
> case is at least properly detected.
>
> But even then there is still the problem of "it worked before I upgraded
> the kernel".
>
> IOW, we are still up a creek without a paddle, unless you would be
> willing to utilize the existing CRIU bug to distribute the 'deal with
> new kernel' mechanics as a bug bounty :)
>
> Fix for the loop termination below.
I've prepared a PR with your patch (with minimal change) and added you
Signed-off-by:, hope it's ok:
https://github.com/checkpoint-restore/criu/pull/2174
>
> Thanks,
>
> tglx
> ---
> criu/pie/restorer.c | 24 +++++++++++++-----------
> 1 file changed, 13 insertions(+), 11 deletions(-)
>
> --- a/criu/pie/restorer.c
> +++ b/criu/pie/restorer.c
> @@ -1169,10 +1169,10 @@ static int timerfd_arm(struct task_resto
> static int create_posix_timers(struct task_restore_args *args)
> {
> int ret, i;
> - kernel_timer_t next_id;
> + kernel_timer_t next_id, timer_id;
> struct sigevent sev;
>
> - for (i = 0; i < args->posix_timers_n; i++) {
> + for (i = 0, next_id = 0; i < args->posix_timers_n; i++) {
> sev.sigev_notify = args->posix_timers[i].spt.it_sigev_notify;
> sev.sigev_signo = args->posix_timers[i].spt.si_signo;
> #ifdef __GLIBC__
> @@ -1183,25 +1183,27 @@ static int create_posix_timers(struct ta
> sev.sigev_value.sival_ptr = args->posix_timers[i].spt.sival_ptr;
>
> while (1) {
> - ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &next_id);
> + ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &timer_id);
> if (ret < 0) {
> pr_err("Can't create posix timer - %d\n", i);
> return ret;
> }
>
> - if (next_id == args->posix_timers[i].spt.it_id)
> + if (timer_id != next_id) {
> + pr_err("Can't create timers, kernel don't give them consequently\n");
> + return -1;
> + }
> +
> + next_id++;
> +
> + if (timer_id == args->posix_timers[i].spt.it_id)
> break;
>
> - ret = sys_timer_delete(next_id);
> + ret = sys_timer_delete(timer_id);
> if (ret < 0) {
> - pr_err("Can't remove temporaty posix timer 0x%x\n", next_id);
> + pr_err("Can't remove temporaty posix timer 0x%x\n", timer_id);
> return ret;
> }
> -
> - if ((long)next_id > args->posix_timers[i].spt.it_id) {
> - pr_err("Can't create timers, kernel don't give them consequently\n");
> - return -1;
> - }
> }
> }
>
>
>
--
Best regards, Tikhomirov Pavel
Senior Software Developer, Virtuozzo.
On Thu, May 11, 2023 at 2:36 AM Thomas Gleixner <[email protected]> wrote:
>
> On Thu, May 11 2023 at 11:17, Pavel Tikhomirov wrote:
> > On 10.05.2023 16:16, Andrey Vagin wrote:
> >>>
> >>> So because of that half thought out user space ABI we are now up the
> >>> regression creek without a paddle, unless CRIU can accomodate to a
> >>> different restore mechanism to lift this restriction from the kernel.
> >>>
> >> If you give us a new API to create timers with specified id-s, we will
> >> figure out how to live with it. It isn't good to ask users to update
> >> CRIU to work on new kernels, but here are reasons and event improvements
> >> for CRIU, so I think it's worth it.
> >
> > I agree, any API to create timers with specified id-s would work for new
> > CRIU versions.
>
> The real question is whether this will cause any upheaval when a new
> kernel meets a non-updated CRIU stack.
It depends on what you mean by upheaval. We found that CRIU can be stuck
in a busy loop with the new changes. I suggest thinking about how to
work around this case and make sure that CRIU reports an error. The
error should minimize the time that users will need to spend to find the
reason and ways to resolve the problem.
One of the ways to fix the problem is to return indexes in a backward
direction from INT_MAX to zero. But in the kernel, user indices can be
converted back to "normal" values:
kernel_timer_id = INT_MAX - user_timer_id;
I have one idea of how to make these changes without breaking CRIU. CRIU
does a few special things. First, it does all timer operations from a
thread leader. Second, it calls timer_settime only after creating all
timers. Third, it calls timer_delete for the last timer only. Any of
these events can be a trigger to switch to the new algo of allocating
timer id-s, but new processes allocate indices according to old rules.
It seems unfortunate that a real application will create a set of very
sparse indices without triggering one of these events.
I don't think that it is worth doing something like this, but if we want to
strictly follow the rules, it is the choice.
>
> You know the UABI regression rules of the kernel...
There is no rule without exceptions... With all pros and cons, we may
consider this case as an exception. From our side, we will try to make
everything to minimize the impact. Here are steps off the top of my
head:
* releasing the criu fix before the kernel release.
* update packages in Linux distros (Debian, Ubuntu, Fedora, and
others that we will find).
* send an announcement to the criu mailing list and to users that we know.
* add the error to FAQ.
* create a GitHub issue with a full description.
Thanks,
Andrei
Andrey!
On Thu, May 11 2023 at 18:21, Andrey Vagin wrote:
> On Thu, May 11, 2023 at 2:36 AM Thomas Gleixner <[email protected]> wrote:
>>
>> You know the UABI regression rules of the kernel...
>
> There is no rule without exceptions... With all pros and cons, we may
> consider this case as an exception. From our side, we will try to make
> everything to minimize the impact. Here are steps off the top of my
> head:
> * releasing the criu fix before the kernel release.
> * update packages in Linux distros (Debian, Ubuntu, Fedora, and
> others that we will find).
> * send an announcement to the criu mailing list and to users that we know.
> * add the error to FAQ.
> * create a GitHub issue with a full description.
Thanks for this plan. After digging deeper I managed to resolve the
actual problem I was chasing without changing that ID generator at
all.
The main pain point of having to do that lookup from the signal delivery
path is gone, which made it trivial to do the fix for the SIG_IGN mess
w/o these global lookups too.
Addressing this global ID issues I pointed out becomes therefore an
orthogonal issue which we can handle completely independent of the
kernel internal problems I'm trying to address.
I still think we should do that for sanity sake, but we can stage that
properly without dependencies outside of this particular ABI
problem. That makes me way more comfortable as that's something which
can be in the worst case reverted without doing any other damage.
Thanks,
tglx
posix_timer_add() tries to allocate a posix timer ID by starting from the
cached ID which was stored by the last successful allocation.
This is done in a loop searching the ID space for a free slot one by
one. The loop has to terminate when the search wrapped around to the
starting point.
But that's racy vs. establishing the starting point. That is read out
lockless, which leads to the following problem:
CPU0 CPU1
posix_timer_add()
start = sig->posix_timer_id;
lock(hash_lock);
... posix_timer_add()
if (++sig->posix_timer_id < 0)
start = sig->posix_timer_id;
sig->posix_timer_id = 0;
So CPU1 can observe a negative start value, i.e. -1, and the loop break
never happens because the condition can never be true:
if (sig->posix_timer_id == start)
break;
While this is unlikely to ever turn into an endless loop as the ID space is
huge (INT_MAX), the racy read of the start value caught the attention of
KCSAN and Dmitry unearthed that incorrectness.
Rewrite it so that all id operations are under the hash lock.
Reported-by: [email protected]
Reported-by: Dmitry Vyukov <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
---
V2: Make the loop less hideous.
---
include/linux/sched/signal.h | 2 +-
kernel/time/posix-timers.c | 31 ++++++++++++++++++-------------
2 files changed, 19 insertions(+), 14 deletions(-)
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -135,7 +135,7 @@ struct signal_struct {
#ifdef CONFIG_POSIX_TIMERS
/* POSIX.1b Interval Timers */
- int posix_timer_id;
+ unsigned int next_posix_timer_id;
struct list_head posix_timers;
/* ITIMER_REAL timer for the process */
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -140,25 +140,30 @@ static struct k_itimer *posix_timer_by_i
static int posix_timer_add(struct k_itimer *timer)
{
struct signal_struct *sig = current->signal;
- int first_free_id = sig->posix_timer_id;
struct hlist_head *head;
- int ret = -ENOENT;
+ unsigned int cnt, id;
- do {
+ /*
+ * FIXME: Replace this by a per signal struct xarray once there is
+ * a plan to handle the resulting CRIU regression gracefully.
+ */
+ for (cnt = 0; cnt <= INT_MAX; cnt++) {
spin_lock(&hash_lock);
- head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
- if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
+ id = sig->next_posix_timer_id;
+
+ /* Write the next ID back. Clamp it to the positive space */
+ sig->next_posix_timer_id = (id + 1) & INT_MAX;
+
+ head = &posix_timers_hashtable[hash(sig, id)];
+ if (!__posix_timers_find(head, sig, id)) {
hlist_add_head_rcu(&timer->t_hash, head);
- ret = sig->posix_timer_id;
+ spin_unlock(&hash_lock);
+ return id;
}
- if (++sig->posix_timer_id < 0)
- sig->posix_timer_id = 0;
- if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT))
- /* Loop over all possible ids completed */
- ret = -EAGAIN;
spin_unlock(&hash_lock);
- } while (ret == -ENOENT);
- return ret;
+ }
+ /* POSIX return code when no timer ID could be allocated */
+ return -EAGAIN;
}
static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
Le Thu, Jun 01, 2023 at 08:58:47PM +0200, Thomas Gleixner a ?crit :
> posix_timer_add() tries to allocate a posix timer ID by starting from the
> cached ID which was stored by the last successful allocation.
>
> This is done in a loop searching the ID space for a free slot one by
> one. The loop has to terminate when the search wrapped around to the
> starting point.
>
> But that's racy vs. establishing the starting point. That is read out
> lockless, which leads to the following problem:
>
> CPU0 CPU1
> posix_timer_add()
> start = sig->posix_timer_id;
> lock(hash_lock);
> ... posix_timer_add()
> if (++sig->posix_timer_id < 0)
> start = sig->posix_timer_id;
> sig->posix_timer_id = 0;
>
> So CPU1 can observe a negative start value, i.e. -1, and the loop break
> never happens because the condition can never be true:
>
> if (sig->posix_timer_id == start)
> break;
>
> While this is unlikely to ever turn into an endless loop as the ID space is
> huge (INT_MAX), the racy read of the start value caught the attention of
> KCSAN and Dmitry unearthed that incorrectness.
>
> Rewrite it so that all id operations are under the hash lock.
>
> Reported-by: [email protected]
> Reported-by: Dmitry Vyukov <[email protected]>
> Signed-off-by: Thomas Gleixner <[email protected]>
Reviewed-by: Frederic Weisbecker <[email protected]>
The following commit has been merged into the timers/core branch of tip:
Commit-ID: cec58cad4614f44b76624f8d62ef771c8725c483
Gitweb: https://git.kernel.org/tip/cec58cad4614f44b76624f8d62ef771c8725c483
Author: Thomas Gleixner <[email protected]>
AuthorDate: Thu, 01 Jun 2023 20:58:47 +02:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Mon, 05 Jun 2023 17:03:36 +02:00
posix-timers: Ensure timer ID search-loop limit is valid
posix_timer_add() tries to allocate a posix timer ID by starting from the
cached ID which was stored by the last successful allocation.
This is done in a loop searching the ID space for a free slot one by
one. The loop has to terminate when the search wrapped around to the
starting point.
But that's racy vs. establishing the starting point. That is read out
lockless, which leads to the following problem:
CPU0 CPU1
posix_timer_add()
start = sig->posix_timer_id;
lock(hash_lock);
... posix_timer_add()
if (++sig->posix_timer_id < 0)
start = sig->posix_timer_id;
sig->posix_timer_id = 0;
So CPU1 can observe a negative start value, i.e. -1, and the loop break
never happens because the condition can never be true:
if (sig->posix_timer_id == start)
break;
While this is unlikely to ever turn into an endless loop as the ID space is
huge (INT_MAX), the racy read of the start value caught the attention of
KCSAN and Dmitry unearthed that incorrectness.
Rewrite it so that all id operations are under the hash lock.
Reported-by: [email protected]
Reported-by: Dmitry Vyukov <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Reviewed-by: Frederic Weisbecker <[email protected]>
Link: https://lore.kernel.org/r/87bkhzdn6g.ffs@tglx
---
include/linux/sched/signal.h | 2 +-
kernel/time/posix-timers.c | 31 ++++++++++++++++++-------------
2 files changed, 19 insertions(+), 14 deletions(-)
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 2009926..669e8cf 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -135,7 +135,7 @@ struct signal_struct {
#ifdef CONFIG_POSIX_TIMERS
/* POSIX.1b Interval Timers */
- int posix_timer_id;
+ unsigned int next_posix_timer_id;
struct list_head posix_timers;
/* ITIMER_REAL timer for the process */
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 2d835c2..2c5daeb 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -140,25 +140,30 @@ static struct k_itimer *posix_timer_by_id(timer_t id)
static int posix_timer_add(struct k_itimer *timer)
{
struct signal_struct *sig = current->signal;
- int first_free_id = sig->posix_timer_id;
struct hlist_head *head;
- int ret = -ENOENT;
+ unsigned int cnt, id;
- do {
+ /*
+ * FIXME: Replace this by a per signal struct xarray once there is
+ * a plan to handle the resulting CRIU regression gracefully.
+ */
+ for (cnt = 0; cnt <= INT_MAX; cnt++) {
spin_lock(&hash_lock);
- head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
- if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
+ id = sig->next_posix_timer_id;
+
+ /* Write the next ID back. Clamp it to the positive space */
+ sig->next_posix_timer_id = (id + 1) & INT_MAX;
+
+ head = &posix_timers_hashtable[hash(sig, id)];
+ if (!__posix_timers_find(head, sig, id)) {
hlist_add_head_rcu(&timer->t_hash, head);
- ret = sig->posix_timer_id;
+ spin_unlock(&hash_lock);
+ return id;
}
- if (++sig->posix_timer_id < 0)
- sig->posix_timer_id = 0;
- if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT))
- /* Loop over all possible ids completed */
- ret = -EAGAIN;
spin_unlock(&hash_lock);
- } while (ret == -ENOENT);
- return ret;
+ }
+ /* POSIX return code when no timer ID could be allocated */
+ return -EAGAIN;
}
static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
The following commit has been merged into the timers/core branch of tip:
Commit-ID: 8ce8849dd1e78dadcee0ec9acbd259d239b7069f
Gitweb: https://git.kernel.org/tip/8ce8849dd1e78dadcee0ec9acbd259d239b7069f
Author: Thomas Gleixner <[email protected]>
AuthorDate: Thu, 01 Jun 2023 20:58:47 +02:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Sun, 18 Jun 2023 22:41:48 +02:00
posix-timers: Ensure timer ID search-loop limit is valid
posix_timer_add() tries to allocate a posix timer ID by starting from the
cached ID which was stored by the last successful allocation.
This is done in a loop searching the ID space for a free slot one by
one. The loop has to terminate when the search wrapped around to the
starting point.
But that's racy vs. establishing the starting point. That is read out
lockless, which leads to the following problem:
CPU0 CPU1
posix_timer_add()
start = sig->posix_timer_id;
lock(hash_lock);
... posix_timer_add()
if (++sig->posix_timer_id < 0)
start = sig->posix_timer_id;
sig->posix_timer_id = 0;
So CPU1 can observe a negative start value, i.e. -1, and the loop break
never happens because the condition can never be true:
if (sig->posix_timer_id == start)
break;
While this is unlikely to ever turn into an endless loop as the ID space is
huge (INT_MAX), the racy read of the start value caught the attention of
KCSAN and Dmitry unearthed that incorrectness.
Rewrite it so that all id operations are under the hash lock.
Reported-by: [email protected]
Reported-by: Dmitry Vyukov <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Reviewed-by: Frederic Weisbecker <[email protected]>
Link: https://lore.kernel.org/r/87bkhzdn6g.ffs@tglx
---
include/linux/sched/signal.h | 2 +-
kernel/time/posix-timers.c | 31 ++++++++++++++++++-------------
2 files changed, 19 insertions(+), 14 deletions(-)
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 2009926..669e8cf 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -135,7 +135,7 @@ struct signal_struct {
#ifdef CONFIG_POSIX_TIMERS
/* POSIX.1b Interval Timers */
- int posix_timer_id;
+ unsigned int next_posix_timer_id;
struct list_head posix_timers;
/* ITIMER_REAL timer for the process */
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index ed3c4a9..2d6cf93 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -140,25 +140,30 @@ static struct k_itimer *posix_timer_by_id(timer_t id)
static int posix_timer_add(struct k_itimer *timer)
{
struct signal_struct *sig = current->signal;
- int first_free_id = sig->posix_timer_id;
struct hlist_head *head;
- int ret = -ENOENT;
+ unsigned int cnt, id;
- do {
+ /*
+ * FIXME: Replace this by a per signal struct xarray once there is
+ * a plan to handle the resulting CRIU regression gracefully.
+ */
+ for (cnt = 0; cnt <= INT_MAX; cnt++) {
spin_lock(&hash_lock);
- head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
- if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
+ id = sig->next_posix_timer_id;
+
+ /* Write the next ID back. Clamp it to the positive space */
+ sig->next_posix_timer_id = (id + 1) & INT_MAX;
+
+ head = &posix_timers_hashtable[hash(sig, id)];
+ if (!__posix_timers_find(head, sig, id)) {
hlist_add_head_rcu(&timer->t_hash, head);
- ret = sig->posix_timer_id;
+ spin_unlock(&hash_lock);
+ return id;
}
- if (++sig->posix_timer_id < 0)
- sig->posix_timer_id = 0;
- if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT))
- /* Loop over all possible ids completed */
- ret = -EAGAIN;
spin_unlock(&hash_lock);
- } while (ret == -ENOENT);
- return ret;
+ }
+ /* POSIX return code when no timer ID could be allocated */
+ return -EAGAIN;
}
static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)