The theory behind the jitter dance is that multiple things are poking at
the same cache line. This only works, however, if those things are
actually all in the same cache line. Ensure this is the case by aligning
the struct on the stack to the cache line size.
On x86, this indeed aligns the stack struct:
000000000000000c <try_to_generate_entropy>:
{
c: 55 push %rbp
- d: 53 push %rbx
- e: 48 83 ec 38 sub $0x38,%rsp
+ d: 48 89 e5 mov %rsp,%rbp
+ 10: 41 54 push %r12
+ 12: 53 push %rbx
+ 13: 48 83 e4 c0 and $0xffffffffffffffc0,%rsp
+ 17: 48 83 ec 40 sub $0x40,%rsp
Cc: Sultan Alsawaf <[email protected]>
Fixes: 50ee7529ec45 ("random: try to actively add entropy rather than passively wait for it")
Signed-off-by: Jason A. Donenfeld <[email protected]>
---
drivers/char/random.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 67558b95d531..2494e08c76d8 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1262,7 +1262,7 @@ static void __cold entropy_timer(struct timer_list *timer)
static void __cold try_to_generate_entropy(void)
{
enum { NUM_TRIAL_SAMPLES = 8192, MAX_SAMPLES_PER_BIT = HZ / 15 };
- struct entropy_timer_state stack;
+ struct entropy_timer_state stack ____cacheline_aligned;
unsigned int i, num_different = 0;
unsigned long last = random_get_entropy();
int cpu = -1;
--
2.38.1
Hi Jason,
On Wed, Nov 30, 2022 at 03:08:15AM +0100, Jason A. Donenfeld wrote:
> The theory behind the jitter dance is that multiple things are poking at
> the same cache line. This only works, however, if those things are
> actually all in the same cache line. Ensure this is the case by aligning
> the struct on the stack to the cache line size.
>
> On x86, this indeed aligns the stack struct:
>
> 000000000000000c <try_to_generate_entropy>:
> {
> c: 55 push %rbp
> - d: 53 push %rbx
> - e: 48 83 ec 38 sub $0x38,%rsp
> + d: 48 89 e5 mov %rsp,%rbp
> + 10: 41 54 push %r12
> + 12: 53 push %rbx
> + 13: 48 83 e4 c0 and $0xffffffffffffffc0,%rsp
> + 17: 48 83 ec 40 sub $0x40,%rsp
>
> Cc: Sultan Alsawaf <[email protected]>
> Fixes: 50ee7529ec45 ("random: try to actively add entropy rather than passively wait for it")
> Signed-off-by: Jason A. Donenfeld <[email protected]>
> ---
> drivers/char/random.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/char/random.c b/drivers/char/random.c
> index 67558b95d531..2494e08c76d8 100644
> --- a/drivers/char/random.c
> +++ b/drivers/char/random.c
> @@ -1262,7 +1262,7 @@ static void __cold entropy_timer(struct timer_list *timer)
> static void __cold try_to_generate_entropy(void)
> {
> enum { NUM_TRIAL_SAMPLES = 8192, MAX_SAMPLES_PER_BIT = HZ / 15 };
> - struct entropy_timer_state stack;
> + struct entropy_timer_state stack ____cacheline_aligned;
Several years ago, there was a whole thing about how __attribute__((aligned)) to
more than 8 bytes doesn't actually work on stack variables in the kernel on x86,
because the kernel only keeps the stack 8-byte aligned but gcc assumes it is
16-byte aligned. See
https://lore.kernel.org/linux-crypto/[email protected]/T/#t
IIRC, nothing was done about it at the time.
Has that been resolved in the intervening years?
- Eric
Hi Eric,
On Tue, Nov 29, 2022 at 08:55:48PM -0800, Eric Biggers wrote:
> Hi Jason,
>
> On Wed, Nov 30, 2022 at 03:08:15AM +0100, Jason A. Donenfeld wrote:
> > The theory behind the jitter dance is that multiple things are poking at
> > the same cache line. This only works, however, if those things are
> > actually all in the same cache line. Ensure this is the case by aligning
> > the struct on the stack to the cache line size.
> >
> > On x86, this indeed aligns the stack struct:
> >
> > 000000000000000c <try_to_generate_entropy>:
> > {
> > c: 55 push %rbp
> > - d: 53 push %rbx
> > - e: 48 83 ec 38 sub $0x38,%rsp
> > + d: 48 89 e5 mov %rsp,%rbp
> > + 10: 41 54 push %r12
> > + 12: 53 push %rbx
> > + 13: 48 83 e4 c0 and $0xffffffffffffffc0,%rsp
> > + 17: 48 83 ec 40 sub $0x40,%rsp
> >
> > Cc: Sultan Alsawaf <[email protected]>
> > Fixes: 50ee7529ec45 ("random: try to actively add entropy rather than passively wait for it")
> > Signed-off-by: Jason A. Donenfeld <[email protected]>
> > ---
> > drivers/char/random.c | 2 +-
> > 1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/drivers/char/random.c b/drivers/char/random.c
> > index 67558b95d531..2494e08c76d8 100644
> > --- a/drivers/char/random.c
> > +++ b/drivers/char/random.c
> > @@ -1262,7 +1262,7 @@ static void __cold entropy_timer(struct timer_list *timer)
> > static void __cold try_to_generate_entropy(void)
> > {
> > enum { NUM_TRIAL_SAMPLES = 8192, MAX_SAMPLES_PER_BIT = HZ / 15 };
> > - struct entropy_timer_state stack;
> > + struct entropy_timer_state stack ____cacheline_aligned;
>
> Several years ago, there was a whole thing about how __attribute__((aligned)) to
> more than 8 bytes doesn't actually work on stack variables in the kernel on x86,
> because the kernel only keeps the stack 8-byte aligned but gcc assumes it is
> 16-byte aligned. See
> https://lore.kernel.org/linux-crypto/[email protected]/T/#t
>
> IIRC, nothing was done about it at the time.
>
> Has that been resolved in the intervening years?
Maybe things are different for ____cacheline_aligned, which is 64 bytes.
Reading that thread, it looks like it was a case of trying to align the
stack to 16 bytes, but gcc assumed 16 bytes already while the kernel
only gave it 8. So gcc didn't think it needed to emit any code to align
it. Here, though, it's 64, and gcc certainly isn't assuming 64-byte
stack alignment.
Looking at the codegen, gcc appears to be doing `rsp = (rsp & ~63) - 64`,
which looks correct.
Jason
On Wed, Nov 30, 2022 at 11:04:23AM +0100, Jason A. Donenfeld wrote:
> > > diff --git a/drivers/char/random.c b/drivers/char/random.c
> > > index 67558b95d531..2494e08c76d8 100644
> > > --- a/drivers/char/random.c
> > > +++ b/drivers/char/random.c
> > > @@ -1262,7 +1262,7 @@ static void __cold entropy_timer(struct timer_list *timer)
> > > static void __cold try_to_generate_entropy(void)
> > > {
> > > enum { NUM_TRIAL_SAMPLES = 8192, MAX_SAMPLES_PER_BIT = HZ / 15 };
> > > - struct entropy_timer_state stack;
> > > + struct entropy_timer_state stack ____cacheline_aligned;
> >
> > Several years ago, there was a whole thing about how __attribute__((aligned)) to
> > more than 8 bytes doesn't actually work on stack variables in the kernel on x86,
> > because the kernel only keeps the stack 8-byte aligned but gcc assumes it is
> > 16-byte aligned. See
> > https://lore.kernel.org/linux-crypto/[email protected]/T/#t
> >
> > IIRC, nothing was done about it at the time.
> >
> > Has that been resolved in the intervening years?
>
> Maybe things are different for ____cacheline_aligned, which is 64 bytes.
> Reading that thread, it looks like it was a case of trying to align the
> stack to 16 bytes, but gcc assumed 16 bytes already while the kernel
> only gave it 8. So gcc didn't think it needed to emit any code to align
> it. Here, though, it's 64, and gcc certainly isn't assuming 64-byte
> stack alignment.
>
> Looking at the codegen, gcc appears to doing `rsp = (rsp & ~63) - 64`,
> which appears correct.
Well, if gcc thinks the stack is already 16-byte aligned, then it would be
perfectly within its rights to do 'rsp = (rsp & ~47) - 64', right? You probably
don't want to be relying on an implementation detail of gcc codegen...
- Eric
Hi Eric,
On Wed, Nov 30, 2022 at 8:51 PM Eric Biggers <[email protected]> wrote:
>
> On Wed, Nov 30, 2022 at 08:31:33PM +0100, Jason A. Donenfeld wrote:
> > On Wed, Nov 30, 2022 at 7:59 PM Eric Biggers <[email protected]> wrote:
> > >
> > > On Wed, Nov 30, 2022 at 11:04:23AM +0100, Jason A. Donenfeld wrote:
> > > > > > diff --git a/drivers/char/random.c b/drivers/char/random.c
> > > > > > index 67558b95d531..2494e08c76d8 100644
> > > > > > --- a/drivers/char/random.c
> > > > > > +++ b/drivers/char/random.c
> > > > > > @@ -1262,7 +1262,7 @@ static void __cold entropy_timer(struct timer_list *timer)
> > > > > > static void __cold try_to_generate_entropy(void)
> > > > > > {
> > > > > > enum { NUM_TRIAL_SAMPLES = 8192, MAX_SAMPLES_PER_BIT = HZ / 15 };
> > > > > > - struct entropy_timer_state stack;
> > > > > > + struct entropy_timer_state stack ____cacheline_aligned;
> > > > >
> > > > > Several years ago, there was a whole thing about how __attribute__((aligned)) to
> > > > > more than 8 bytes doesn't actually work on stack variables in the kernel on x86,
> > > > > because the kernel only keeps the stack 8-byte aligned but gcc assumes it is
> > > > > 16-byte aligned. See
> > > > > https://lore.kernel.org/linux-crypto/[email protected]/T/#t
> > > > >
> > > > > IIRC, nothing was done about it at the time.
> > > > >
> > > > > Has that been resolved in the intervening years?
> > > >
> > > > Maybe things are different for ____cacheline_aligned, which is 64 bytes.
> > > > Reading that thread, it looks like it was a case of trying to align the
> > > > stack to 16 bytes, but gcc assumed 16 bytes already while the kernel
> > > > only gave it 8. So gcc didn't think it needed to emit any code to align
> > > > it. Here, though, it's 64, and gcc certainly isn't assuming 64-byte
> > > > stack alignment.
> > > >
> > > > Looking at the codegen, gcc appears to doing `rsp = (rsp & ~63) - 64`,
> > > > which appears correct.
> > >
> > > Well, if gcc thinks the stack is already 16-byte aligned, then it would be
> > > perfectly within its rights to do 'rsp = (rsp & ~47) - 64', right? You probably
> > > don't want to be relying on an implementation detail of gcc codegen...
> >
> > The really pathological one would be ~48, which would just clear those
> > two extra bits. I can't imagine gcc or clang ever deciding to do that.
> > But I guess they could?
> >
> > What would you recommend here? kmalloc'ing it instead? Keeping things
> > as is with ____cacheline_aligned, since this has always been broken,
> > and it's not the end of the world? Something else?
>
> Well, other places in the kernel do the alignment manually:
>
> u8 __stack[sizeof(struct entropy_timer_state) + SMP_CACHE_BYTES - 1];
> struct entropy_timer_state *stack = (void *)PTR_ALIGN(__stack, SMP_CACHE_BYTES);
>
> It's silly, but I'm not aware of a better option.
Well alright then, why not. I'll send a v2.
Jason
On Wed, Nov 30, 2022 at 08:31:33PM +0100, Jason A. Donenfeld wrote:
> On Wed, Nov 30, 2022 at 7:59 PM Eric Biggers <[email protected]> wrote:
> >
> > On Wed, Nov 30, 2022 at 11:04:23AM +0100, Jason A. Donenfeld wrote:
> > > > > diff --git a/drivers/char/random.c b/drivers/char/random.c
> > > > > index 67558b95d531..2494e08c76d8 100644
> > > > > --- a/drivers/char/random.c
> > > > > +++ b/drivers/char/random.c
> > > > > @@ -1262,7 +1262,7 @@ static void __cold entropy_timer(struct timer_list *timer)
> > > > > static void __cold try_to_generate_entropy(void)
> > > > > {
> > > > > enum { NUM_TRIAL_SAMPLES = 8192, MAX_SAMPLES_PER_BIT = HZ / 15 };
> > > > > - struct entropy_timer_state stack;
> > > > > + struct entropy_timer_state stack ____cacheline_aligned;
> > > >
> > > > Several years ago, there was a whole thing about how __attribute__((aligned)) to
> > > > more than 8 bytes doesn't actually work on stack variables in the kernel on x86,
> > > > because the kernel only keeps the stack 8-byte aligned but gcc assumes it is
> > > > 16-byte aligned. See
> > > > https://lore.kernel.org/linux-crypto/[email protected]/T/#t
> > > >
> > > > IIRC, nothing was done about it at the time.
> > > >
> > > > Has that been resolved in the intervening years?
> > >
> > > Maybe things are different for ____cacheline_aligned, which is 64 bytes.
> > > Reading that thread, it looks like it was a case of trying to align the
> > > stack to 16 bytes, but gcc assumed 16 bytes already while the kernel
> > > only gave it 8. So gcc didn't think it needed to emit any code to align
> > > it. Here, though, it's 64, and gcc certainly isn't assuming 64-byte
> > > stack alignment.
> > >
> > > Looking at the codegen, gcc appears to doing `rsp = (rsp & ~63) - 64`,
> > > which appears correct.
> >
> > Well, if gcc thinks the stack is already 16-byte aligned, then it would be
> > perfectly within its rights to do 'rsp = (rsp & ~47) - 64', right? You probably
> > don't want to be relying on an implementation detail of gcc codegen...
>
> The really pathological one would be ~48, which would just clear those
> two extra bits. I can't imagine gcc or clang ever deciding to do that.
> But I guess they could?
>
> What would you recommend here? kmalloc'ing it instead? Keeping things
> as is with ____cacheline_aligned, since this has always been broken,
> and it's not the end of the world? Something else?
Well, other places in the kernel do the alignment manually:
u8 __stack[sizeof(struct entropy_timer_state) + SMP_CACHE_BYTES - 1];
struct entropy_timer_state *stack = (void *)PTR_ALIGN(__stack, SMP_CACHE_BYTES);
It's silly, but I'm not aware of a better option.
- Eric
On Wed, Nov 30, 2022 at 7:59 PM Eric Biggers <[email protected]> wrote:
>
> On Wed, Nov 30, 2022 at 11:04:23AM +0100, Jason A. Donenfeld wrote:
> > > > diff --git a/drivers/char/random.c b/drivers/char/random.c
> > > > index 67558b95d531..2494e08c76d8 100644
> > > > --- a/drivers/char/random.c
> > > > +++ b/drivers/char/random.c
> > > > @@ -1262,7 +1262,7 @@ static void __cold entropy_timer(struct timer_list *timer)
> > > > static void __cold try_to_generate_entropy(void)
> > > > {
> > > > enum { NUM_TRIAL_SAMPLES = 8192, MAX_SAMPLES_PER_BIT = HZ / 15 };
> > > > - struct entropy_timer_state stack;
> > > > + struct entropy_timer_state stack ____cacheline_aligned;
> > >
> > > Several years ago, there was a whole thing about how __attribute__((aligned)) to
> > > more than 8 bytes doesn't actually work on stack variables in the kernel on x86,
> > > because the kernel only keeps the stack 8-byte aligned but gcc assumes it is
> > > 16-byte aligned. See
> > > https://lore.kernel.org/linux-crypto/[email protected]/T/#t
> > >
> > > IIRC, nothing was done about it at the time.
> > >
> > > Has that been resolved in the intervening years?
> >
> > Maybe things are different for ____cacheline_aligned, which is 64 bytes.
> > Reading that thread, it looks like it was a case of trying to align the
> > stack to 16 bytes, but gcc assumed 16 bytes already while the kernel
> > only gave it 8. So gcc didn't think it needed to emit any code to align
> > it. Here, though, it's 64, and gcc certainly isn't assuming 64-byte
> > stack alignment.
> >
> > Looking at the codegen, gcc appears to doing `rsp = (rsp & ~63) - 64`,
> > which appears correct.
>
> Well, if gcc thinks the stack is already 16-byte aligned, then it would be
> perfectly within its rights to do 'rsp = (rsp & ~47) - 64', right? You probably
> don't want to be relying on an implementation detail of gcc codegen...
The really pathological one would be ~48, which would just clear those
two extra bits. I can't imagine gcc or clang ever deciding to do that.
But I guess they could?
What would you recommend here? kmalloc'ing it instead? Keeping things
as is with ____cacheline_aligned, since this has always been broken,
and it's not the end of the world? Something else?
Jason
The theory behind the jitter dance is that multiple things are poking at
the same cache line. This only works, however, if what's being poked at
is actually all in the same cache line. Ensure this is the case by
aligning the struct on the stack to the cache line size.
We can't use ____cacheline_aligned on a stack variable, because gcc
assumes 16-byte stack alignment when only 8-byte alignment is provided
by the kernel, which means gcc could technically do something pathological
like `(rsp & ~48) - 64`. It doesn't, but rather than risk it, just do
the stack alignment manually with PTR_ALIGN and an oversized buffer.
Fixes: 50ee7529ec45 ("random: try to actively add entropy rather than passively wait for it")
Cc: Eric Biggers <[email protected]>
Signed-off-by: Jason A. Donenfeld <[email protected]>
---
drivers/char/random.c | 33 +++++++++++++++++----------------
1 file changed, 17 insertions(+), 16 deletions(-)
diff --git a/drivers/char/random.c b/drivers/char/random.c
index acb9548a870e..46bb81c2da6e 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1265,29 +1265,30 @@ static void __cold entropy_timer(struct timer_list *timer)
static void __cold try_to_generate_entropy(void)
{
enum { NUM_TRIAL_SAMPLES = 8192, MAX_SAMPLES_PER_BIT = HZ / 15 };
- struct entropy_timer_state stack;
+ u8 stack_bytes[sizeof(struct entropy_timer_state) + SMP_CACHE_BYTES - 1];
+ struct entropy_timer_state *stack = PTR_ALIGN((void *)stack_bytes, SMP_CACHE_BYTES);
unsigned int i, num_different = 0;
unsigned long last = random_get_entropy();
int cpu = -1;
for (i = 0; i < NUM_TRIAL_SAMPLES - 1; ++i) {
- stack.entropy = random_get_entropy();
- if (stack.entropy != last)
+ stack->entropy = random_get_entropy();
+ if (stack->entropy != last)
++num_different;
- last = stack.entropy;
+ last = stack->entropy;
}
- stack.samples_per_bit = DIV_ROUND_UP(NUM_TRIAL_SAMPLES, num_different + 1);
- if (stack.samples_per_bit > MAX_SAMPLES_PER_BIT)
+ stack->samples_per_bit = DIV_ROUND_UP(NUM_TRIAL_SAMPLES, num_different + 1);
+ if (stack->samples_per_bit > MAX_SAMPLES_PER_BIT)
return;
- atomic_set(&stack.samples, 0);
- timer_setup_on_stack(&stack.timer, entropy_timer, 0);
+ atomic_set(&stack->samples, 0);
+ timer_setup_on_stack(&stack->timer, entropy_timer, 0);
while (!crng_ready() && !signal_pending(current)) {
/*
* Check !timer_pending() and then ensure that any previous callback has finished
* executing by checking try_to_del_timer_sync(), before queueing the next one.
*/
- if (!timer_pending(&stack.timer) && try_to_del_timer_sync(&stack.timer) >= 0) {
+ if (!timer_pending(&stack->timer) && try_to_del_timer_sync(&stack->timer) >= 0) {
struct cpumask timer_cpus;
unsigned int num_cpus;
@@ -1312,20 +1313,20 @@ static void __cold try_to_generate_entropy(void)
} while (cpu == smp_processor_id() && num_cpus > 1);
/* Expiring the timer at `jiffies` means it's the next tick. */
- stack.timer.expires = jiffies;
+ stack->timer.expires = jiffies;
- add_timer_on(&stack.timer, cpu);
+ add_timer_on(&stack->timer, cpu);
preempt_enable();
}
- mix_pool_bytes(&stack.entropy, sizeof(stack.entropy));
+ mix_pool_bytes(&stack->entropy, sizeof(stack->entropy));
schedule();
- stack.entropy = random_get_entropy();
+ stack->entropy = random_get_entropy();
}
- mix_pool_bytes(&stack.entropy, sizeof(stack.entropy));
+ mix_pool_bytes(&stack->entropy, sizeof(stack->entropy));
- del_timer_sync(&stack.timer);
- destroy_timer_on_stack(&stack.timer);
+ del_timer_sync(&stack->timer);
+ destroy_timer_on_stack(&stack->timer);
}
--
2.38.1