The entropy_store::lock can not be acquired with disabled interrupts on
PREEMPT_RT because it is a sleeping lock. The lock is acquired from
hard-irq context while the primary interrupt handler is invoked.
The series restructures the code a little so the lock is only acquired
in process context on PREEMPT_RT while it remains as-is for non-RT.
The first two patches remove unused arguments from functions. The
following two reshuffle the code a little while the last one addresses
the PREEMPT_RT related issue.
Sebastian
The state of the fast_pool (number of added entropy, timestamp of last
addition) is reset after entropy has been consumed.
Move the reset of the fast_pool into the caller.
This is a preparations step to ease PREEMPT_RT support.
Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
---
drivers/char/random.c | 29 +++++++++++++++--------------
1 file changed, 15 insertions(+), 14 deletions(-)
diff --git a/drivers/char/random.c b/drivers/char/random.c
index dfc38d87125f5..4bcaa7886201d 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1242,37 +1242,35 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
return *ptr;
}
-static void process_interrupt_randomness_pool(struct fast_pool *fast_pool)
+static bool process_interrupt_randomness_pool(struct fast_pool *fast_pool)
{
struct entropy_store *r;
- unsigned long now = jiffies;
if (unlikely(crng_init == 0)) {
+ bool pool_reset = false;
+
if ((fast_pool->count >= 64) &&
crng_fast_load((char *) fast_pool->pool,
- sizeof(fast_pool->pool))) {
- fast_pool->count = 0;
- fast_pool->last = now;
- }
- return;
+ sizeof(fast_pool->pool)))
+ pool_reset = true;
+
+ return pool_reset;
}
if ((fast_pool->count < 64) &&
- !time_after(now, fast_pool->last + HZ))
- return;
+ !time_after(jiffies, fast_pool->last + HZ))
+ return false;
r = &input_pool;
if (!spin_trylock(&r->lock))
- return;
+ return false;
- fast_pool->last = now;
__mix_pool_bytes(r, &fast_pool->pool, sizeof(fast_pool->pool));
spin_unlock(&r->lock);
- fast_pool->count = 0;
-
/* award one bit for the contents of the fast pool */
credit_entropy_bits(r, 1);
+ return true;
}
void add_interrupt_randomness(int irq)
@@ -1298,7 +1296,10 @@ void add_interrupt_randomness(int irq)
fast_mix(fast_pool);
add_interrupt_bench(cycles);
- process_interrupt_randomness_pool(fast_pool);
+ if (process_interrupt_randomness_pool(fast_pool)) {
+ fast_pool->last = now;
+ fast_pool->count = 0;
+ }
}
EXPORT_SYMBOL_GPL(add_interrupt_randomness);
--
2.34.1
Since commit
ee3e00e9e7101 ("random: use registers from interrupted code for CPU's w/o a cycle counter")
the irq_flags argument is no longer used.
Remove unused irq_flags irq_flags.
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Dexuan Cui <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Haiyang Zhang <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: K. Y. Srinivasan <[email protected]>
Cc: Stephen Hemminger <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Wei Liu <[email protected]>
Cc: [email protected]
Cc: [email protected]
Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
---
arch/x86/kernel/cpu/mshyperv.c | 2 +-
drivers/char/random.c | 4 ++--
drivers/hv/vmbus_drv.c | 2 +-
include/linux/random.h | 2 +-
kernel/irq/handle.c | 2 +-
5 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index ff55df60228f7..2a0f836789118 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -79,7 +79,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0)
inc_irq_stat(hyperv_stimer0_count);
if (hv_stimer0_handler)
hv_stimer0_handler();
- add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0);
+ add_interrupt_randomness(HYPERV_STIMER0_VECTOR);
ack_APIC_irq();
set_irq_regs(old_regs);
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 605969ed0f965..c8067c264a880 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -200,7 +200,7 @@
* void add_device_randomness(const void *buf, unsigned int size);
* void add_input_randomness(unsigned int type, unsigned int code,
* unsigned int value);
- * void add_interrupt_randomness(int irq, int irq_flags);
+ * void add_interrupt_randomness(int irq);
* void add_disk_randomness(struct gendisk *disk);
*
* add_device_randomness() is for adding data to the random pool that
@@ -1242,7 +1242,7 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
return *ptr;
}
-void add_interrupt_randomness(int irq, int irq_flags)
+void add_interrupt_randomness(int irq)
{
struct entropy_store *r;
struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 392c1ac4f8193..7ae04ccb10438 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1381,7 +1381,7 @@ static void vmbus_isr(void)
tasklet_schedule(&hv_cpu->msg_dpc);
}
- add_interrupt_randomness(vmbus_interrupt, 0);
+ add_interrupt_randomness(vmbus_interrupt);
}
static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id)
diff --git a/include/linux/random.h b/include/linux/random.h
index f45b8be3e3c4e..c45b2693e51fb 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -35,7 +35,7 @@ static inline void add_latent_entropy(void) {}
extern void add_input_randomness(unsigned int type, unsigned int code,
unsigned int value) __latent_entropy;
-extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy;
+extern void add_interrupt_randomness(int irq) __latent_entropy;
extern void get_random_bytes(void *buf, int nbytes);
extern int wait_for_random_bytes(void);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 27182003b879c..68c76c3caaf55 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -197,7 +197,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
retval = __handle_irq_event_percpu(desc, &flags);
- add_interrupt_randomness(desc->irq_data.irq, flags);
+ add_interrupt_randomness(desc->irq_data.irq);
if (!irq_settings_no_debug(desc))
note_interrupt(desc, retval);
--
2.34.1
Split add_interrupt_randomness() into two parts:
- add_interrupt_randomness() which collects the entropy on the
invocation of a hardware interrupt and it feeds into the fast_pool,
a per-CPU variable (irq_randomness).
- process_interrupt_randomness_pool() which feeds the fast_pool/
irq_randomness into the entropy_store if enough entropy has been
gathered.
This is a preparations step to ease PREEMPT_RT support.
Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
---
drivers/char/random.c | 47 +++++++++++++++++++++++++------------------
1 file changed, 27 insertions(+), 20 deletions(-)
diff --git a/drivers/char/random.c b/drivers/char/random.c
index c8067c264a880..dfc38d87125f5 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1242,29 +1242,10 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
return *ptr;
}
-void add_interrupt_randomness(int irq)
+static void process_interrupt_randomness_pool(struct fast_pool *fast_pool)
{
struct entropy_store *r;
- struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
- struct pt_regs *regs = get_irq_regs();
unsigned long now = jiffies;
- cycles_t cycles = random_get_entropy();
- __u32 c_high, j_high;
- __u64 ip;
-
- if (cycles == 0)
- cycles = get_reg(fast_pool, regs);
- c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
- j_high = (sizeof(now) > 4) ? now >> 32 : 0;
- fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
- fast_pool->pool[1] ^= now ^ c_high;
- ip = regs ? instruction_pointer(regs) : _RET_IP_;
- fast_pool->pool[2] ^= ip;
- fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
- get_reg(fast_pool, regs);
-
- fast_mix(fast_pool);
- add_interrupt_bench(cycles);
if (unlikely(crng_init == 0)) {
if ((fast_pool->count >= 64) &&
@@ -1293,6 +1274,32 @@ void add_interrupt_randomness(int irq)
/* award one bit for the contents of the fast pool */
credit_entropy_bits(r, 1);
}
+
+void add_interrupt_randomness(int irq)
+{
+ struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
+ struct pt_regs *regs = get_irq_regs();
+ unsigned long now = jiffies;
+ cycles_t cycles = random_get_entropy();
+ __u32 c_high, j_high;
+ __u64 ip;
+
+ if (cycles == 0)
+ cycles = get_reg(fast_pool, regs);
+ c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
+ j_high = (sizeof(now) > 4) ? now >> 32 : 0;
+ fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
+ fast_pool->pool[1] ^= now ^ c_high;
+ ip = regs ? instruction_pointer(regs) : _RET_IP_;
+ fast_pool->pool[2] ^= ip;
+ fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
+ get_reg(fast_pool, regs);
+
+ fast_mix(fast_pool);
+ add_interrupt_bench(cycles);
+
+ process_interrupt_randomness_pool(fast_pool);
+}
EXPORT_SYMBOL_GPL(add_interrupt_randomness);
#ifdef CONFIG_BLOCK
--
2.34.1
On interrupt invocation, add_interrupt_randomness() adds entropy to its
per-CPU state and if it gathered enough of it then it will mix it into a
entropy_store. In order to do so, it needs to lock the pool by acquiring
entropy_store::lock which is a spinlock_t. This lock can not be acquired
on PREEMPT_RT with disabled interrupts because it is a sleeping lock.
This lock could be made a raw_spinlock_t which will then allow to
acquire it with disabled interrupts on PREEMPT_RT. The lock is usually
hold for short amount of cycles while entropy is added to the pool and
the invocation from the IRQ handler has a try-lock which avoids spinning
on the lock if contended. The extraction of entropy (extract_buf())
needs a few cycles more because it performs additionally few
SHA1 transformations. This takes around 5-10us on a testing box (E5-2650
32 Cores, 2way NUMA) and is negligible.
The frequent invocation of the IOCTLs RNDADDTOENTCNT and RNDRESEEDCRNG
on multiple CPUs in parallel leads to filling and depletion of the pool
which in turn results in heavy contention on the lock. The spinning with
disabled interrupts on multiple CPUs leads to latencies of at least
100us on the same machine which is no longer acceptable.
Collect only the IRQ randomness in IRQ-context on PREEMPT_RT.
In threaded-IRQ context, make a copy of the per-CPU state with disabled
interrupts to ensure that it is not modified while duplicated. Pass the
copy to process_interrupt_randomness_pool() and reset the per-CPU
afterwards if needed.
Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
---
drivers/char/random.c | 39 ++++++++++++++++++++++++++++++++++++---
include/linux/random.h | 1 +
kernel/irq/manage.c | 3 +++
3 files changed, 40 insertions(+), 3 deletions(-)
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 4bcaa7886201d..725af4bf76c0e 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1273,6 +1273,32 @@ static bool process_interrupt_randomness_pool(struct fast_pool *fast_pool)
return true;
}
+#ifdef CONFIG_PREEMPT_RT
+void process_interrupt_randomness(void)
+{
+ struct fast_pool *cpu_pool;
+ struct fast_pool fast_pool;
+
+ lockdep_assert_irqs_enabled();
+
+ migrate_disable();
+ cpu_pool = this_cpu_ptr(&irq_randomness);
+
+ local_irq_disable();
+ memcpy(&fast_pool, cpu_pool, sizeof(fast_pool));
+ local_irq_enable();
+
+ if (process_interrupt_randomness_pool(&fast_pool)) {
+ local_irq_disable();
+ cpu_pool->last = jiffies;
+ cpu_pool->count = 0;
+ local_irq_enable();
+ }
+ memzero_explicit(&fast_pool, sizeof(fast_pool));
+ migrate_enable();
+}
+#endif
+
void add_interrupt_randomness(int irq)
{
struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
@@ -1296,9 +1322,16 @@ void add_interrupt_randomness(int irq)
fast_mix(fast_pool);
add_interrupt_bench(cycles);
- if (process_interrupt_randomness_pool(fast_pool)) {
- fast_pool->last = now;
- fast_pool->count = 0;
+ /*
+ * On PREEMPT_RT the entropy can not be fed into the input_pool because
+ * it needs to acquire sleeping locks with disabled interrupts.
+ * This is deferred to the threaded handler.
+ */
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ if (process_interrupt_randomness_pool(fast_pool)) {
+ fast_pool->last = now;
+ fast_pool->count = 0;
+ }
}
}
EXPORT_SYMBOL_GPL(add_interrupt_randomness);
diff --git a/include/linux/random.h b/include/linux/random.h
index c45b2693e51fb..a02c285a5ee52 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -36,6 +36,7 @@ static inline void add_latent_entropy(void) {}
extern void add_input_randomness(unsigned int type, unsigned int code,
unsigned int value) __latent_entropy;
extern void add_interrupt_randomness(int irq) __latent_entropy;
+extern void process_interrupt_randomness(void);
extern void get_random_bytes(void *buf, int nbytes);
extern int wait_for_random_bytes(void);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7405e384e5ed0..d641de1f879f4 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1281,6 +1281,9 @@ static int irq_thread(void *data)
if (action_ret == IRQ_WAKE_THREAD)
irq_wake_secondary(desc, action);
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ process_interrupt_randomness();
+
wake_threads_waitq(desc);
}
--
2.34.1
On Tue, Dec 07, 2021 at 01:17:33PM +0100, Sebastian Andrzej Siewior wrote:
> Since commit
> ee3e00e9e7101 ("random: use registers from interrupted code for CPU's w/o a cycle counter")
>
> the irq_flags argument is no longer used.
>
> Remove unused irq_flags irq_flags.
Two irq_flags here.
[...]
> struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
> diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
> index 392c1ac4f8193..7ae04ccb10438 100644
> --- a/drivers/hv/vmbus_drv.c
> +++ b/drivers/hv/vmbus_drv.c
> @@ -1381,7 +1381,7 @@ static void vmbus_isr(void)
> tasklet_schedule(&hv_cpu->msg_dpc);
> }
>
> - add_interrupt_randomness(vmbus_interrupt, 0);
> + add_interrupt_randomness(vmbus_interrupt);
> }
Acked-by: Wei Liu <[email protected]>
On Tue, Dec 7, 2021 at 1:17 PM Sebastian Andrzej Siewior
<[email protected]> wrote:
>
> Since commit
> ee3e00e9e7101 ("random: use registers from interrupted code for CPU's w/o a cycle counter")
>
> the irq_flags argument is no longer used.
>
> Remove unused irq_flags irq_flags.
>
> Cc: Borislav Petkov <[email protected]>
> Cc: Dave Hansen <[email protected]>
> Cc: Dexuan Cui <[email protected]>
> Cc: H. Peter Anvin <[email protected]>
> Cc: Haiyang Zhang <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> Cc: K. Y. Srinivasan <[email protected]>
> Cc: Stephen Hemminger <[email protected]>
> Cc: Thomas Gleixner <[email protected]>
> Cc: Wei Liu <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
> ---
> arch/x86/kernel/cpu/mshyperv.c | 2 +-
> drivers/char/random.c | 4 ++--
> drivers/hv/vmbus_drv.c | 2 +-
> include/linux/random.h | 2 +-
> kernel/irq/handle.c | 2 +-
> 5 files changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
> index ff55df60228f7..2a0f836789118 100644
> --- a/arch/x86/kernel/cpu/mshyperv.c
> +++ b/arch/x86/kernel/cpu/mshyperv.c
> @@ -79,7 +79,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0)
> inc_irq_stat(hyperv_stimer0_count);
> if (hv_stimer0_handler)
> hv_stimer0_handler();
> - add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0);
> + add_interrupt_randomness(HYPERV_STIMER0_VECTOR);
> ack_APIC_irq();
>
> set_irq_regs(old_regs);
> diff --git a/drivers/char/random.c b/drivers/char/random.c
> index 605969ed0f965..c8067c264a880 100644
> --- a/drivers/char/random.c
> +++ b/drivers/char/random.c
> @@ -200,7 +200,7 @@
> * void add_device_randomness(const void *buf, unsigned int size);
> * void add_input_randomness(unsigned int type, unsigned int code,
> * unsigned int value);
> - * void add_interrupt_randomness(int irq, int irq_flags);
> + * void add_interrupt_randomness(int irq);
> * void add_disk_randomness(struct gendisk *disk);
> *
> * add_device_randomness() is for adding data to the random pool that
> @@ -1242,7 +1242,7 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
> return *ptr;
> }
>
> -void add_interrupt_randomness(int irq, int irq_flags)
> +void add_interrupt_randomness(int irq)
> {
> struct entropy_store *r;
> struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
> diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
> index 392c1ac4f8193..7ae04ccb10438 100644
> --- a/drivers/hv/vmbus_drv.c
> +++ b/drivers/hv/vmbus_drv.c
> @@ -1381,7 +1381,7 @@ static void vmbus_isr(void)
> tasklet_schedule(&hv_cpu->msg_dpc);
> }
>
> - add_interrupt_randomness(vmbus_interrupt, 0);
> + add_interrupt_randomness(vmbus_interrupt);
> }
>
> static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id)
> diff --git a/include/linux/random.h b/include/linux/random.h
> index f45b8be3e3c4e..c45b2693e51fb 100644
> --- a/include/linux/random.h
> +++ b/include/linux/random.h
> @@ -35,7 +35,7 @@ static inline void add_latent_entropy(void) {}
>
> extern void add_input_randomness(unsigned int type, unsigned int code,
> unsigned int value) __latent_entropy;
> -extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy;
> +extern void add_interrupt_randomness(int irq) __latent_entropy;
>
> extern void get_random_bytes(void *buf, int nbytes);
> extern int wait_for_random_bytes(void);
> diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
> index 27182003b879c..68c76c3caaf55 100644
> --- a/kernel/irq/handle.c
> +++ b/kernel/irq/handle.c
> @@ -197,7 +197,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
>
> retval = __handle_irq_event_percpu(desc, &flags);
>
> - add_interrupt_randomness(desc->irq_data.irq, flags);
> + add_interrupt_randomness(desc->irq_data.irq);
>
> if (!irq_settings_no_debug(desc))
> note_interrupt(desc, retval);
> --
> 2.34.1
>
Thanks for this. Sultan noticed the same thing a while back:
https://lore.kernel.org/lkml/20180430031222.mapajgwprkkr6p36@sultan-box/
I'll apply this and the subsequent one.
Regards,
Jason
Hi Sebastian,
Thanks for this series.
> --- a/kernel/irq/manage.c
> +++ b/kernel/irq/manage.c
> @@ -1281,6 +1281,9 @@ static int irq_thread(void *data)
> if (action_ret == IRQ_WAKE_THREAD)
> irq_wake_secondary(desc, action);
>
> + if (IS_ENABLED(CONFIG_PREEMPT_RT))
> + process_interrupt_randomness();
> +
Adding a path from irq_thread() (and complicating the callgraph)
strikes me as a rather large hammer to solve this problem with. Maybe
it's the only way. But I wonder:
> on the lock if contended. The extraction of entropy (extract_buf())
> needs a few cycles more because it performs additionally few
> SHA1 transformations. This takes around 5-10us on a testing box (E5-2650
> 32 Cores, 2way NUMA) and is negligible.
> The frequent invocation of the IOCTLs RNDADDTOENTCNT and RNDRESEEDCRNG
> on multiple CPUs in parallel leads to filling and depletion of the pool
> which in turn results in heavy contention on the lock. The spinning with
> disabled interrupts on multiple CPUs leads to latencies of at least
> 100us on the same machine which is no longer acceptable.
I wonder if this problem would partially go away if, instead, I can
figure out how to make extract_buf() faster? I'd like to replace sha1
in there with something else anyway. I'm not sure what sorts of
speedups I could get, but perhaps something could be eked out. Would
this be viable? Or do you think that even with various speedups the
problem would still exist in one way or another?
Alternatively, is there a different locking scheme that would
prioritize the irq path mostly winning and having the ioctl path
spinning instead?
Also, just curious, what is running RNDRESEEDCRNG so much on a
PREEMPT_RT system and why?
Jason
On 2021-12-07 19:14:16 [+0100], Jason A. Donenfeld wrote:
> Hi Sebastian,
Hi Jason,
> > --- a/kernel/irq/manage.c
> > +++ b/kernel/irq/manage.c
> > @@ -1281,6 +1281,9 @@ static int irq_thread(void *data)
> > if (action_ret == IRQ_WAKE_THREAD)
> > irq_wake_secondary(desc, action);
> >
> > + if (IS_ENABLED(CONFIG_PREEMPT_RT))
> > + process_interrupt_randomness();
> > +
>
> Adding a path from irq_thread() (and complicating the callgraph)
> strikes me as a rather large hammer to solve this problem with. Maybe
> it's the only way. But I wonder:
Well, on PREEMPT_RT the option `threadirqs' is always on so every
threaded interrupt goes through this. This is ensures that the primary
handler does not contribute to the per-CPU random pool for nothing at
all. The only exception would be a non-threaded interrupt (like a
timer) on a CPU which has no threaded interrupts assigned.
> > on the lock if contended. The extraction of entropy (extract_buf())
> > needs a few cycles more because it performs additionally few
> > SHA1 transformations. This takes around 5-10us on a testing box (E5-2650
> > 32 Cores, 2way NUMA) and is negligible.
> > The frequent invocation of the IOCTLs RNDADDTOENTCNT and RNDRESEEDCRNG
> > on multiple CPUs in parallel leads to filling and depletion of the pool
> > which in turn results in heavy contention on the lock. The spinning with
> > disabled interrupts on multiple CPUs leads to latencies of at least
> > 100us on the same machine which is no longer acceptable.
>
> I wonder if this problem would partially go away if, instead, I can
> figure out how to make extract_buf() faster? I'd like to replace sha1
> in there with something else anyway. I'm not sure what sorts of
> speedups I could get, but perhaps something could be eked out. Would
> this be viable? Or do you think that even with various speedups the
> problem would still exist in one way or another?
I don't think that we would gain much in the end. I think the majority
of the problem is that I'm able to trigger lock contention from multiple
CPUs simultaneously. If you want me to, I could replace it with a
busy-loop which takes always 2us-5us or so and see where we get. The
memory access it probably where the jitter is coming from…
> Alternatively, is there a different locking scheme that would
> prioritize the irq path mostly winning and having the ioctl path
> spinning instead?
Even the IOCTL path must spin with disabled interrupts to avoid dead
locks. Therefore it makes no sense if attempt acquire the lock from
process or IRQ context. Something like
while (!raw_spin_trylock_irqsave())
cpu_relax()
might work "better" but then PeterZ will come after me with a torch and
a pitchfork. He might even do so for just mentioning this…
The raw_spin_lock_irqsave() implementation (on x86) uses a ticket system
which enforces a "fair queue" system so the CPUs wait time is "fair".
This isn't the case with hack above because CPU1 might get the lock
after 3us and then again and again while CPU7 and 9 wait for 60us+ with
no luck. CPU7 and 9 would remain preemptible which is good but then lose
a lot cycles doing nothing which is bad from the performance point of
view.
However, since you mentioned different locking scheme: The spinlock_t on
PREEMPT_RT does more or less this: It is a RT-MUTEX based implementation
which can be compared with the mutex_t. The waiter then sleeps and does
not spin on the lock and therefore I don't see the latency spikes.
However since the waiter sleeps, it can not be acquired from hard-IRQ
context which is where add_interrupt_randomness() is invoked.
> Also, just curious, what is running RNDRESEEDCRNG so much on a
> PREEMPT_RT system and why?
Nothing in particular. After switching the locks (and getting
crng_fast_load() out of the hard-irq context, too) I was looking at the
latency and it didn't look bad. So I started pushing all the buttons in
order to hammer on the locks and utilise the whole potential and see how
bad it can get. The potential worst case here so see if it safe swap the
locks.
> Jason
Sebastian
On 2021-12-07 21:10:39 [+0100], To Jason A. Donenfeld wrote:
> On 2021-12-07 19:14:16 [+0100], Jason A. Donenfeld wrote:
> > Hi Sebastian,
> Hi Jason,
Hi Jason,
is there anything I should do and didn't do or do you just need some
time to think?
Sebastian
Sebastian Andrzej Siewior <[email protected]> wrote:
>
> Even the IOCTL path must spin with disabled interrupts to avoid dead
> locks. Therefore it makes no sense if attempt acquire the lock from
> process or IRQ context. Something like
> while (!raw_spin_trylock_irqsave())
> cpu_relax()
What about the TCP socket locking model?
IOW, the user-space slow path will add itself to a backlog queue
during contention, and the interrupt fast path will schedule work
to process any user-space backlog on exit.
Cheers,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On 2021-12-17 13:23:38 [+1100], Herbert Xu wrote:
> Sebastian Andrzej Siewior <[email protected]> wrote:
> >
> > Even the IOCTL path must spin with disabled interrupts to avoid dead
> > locks. Therefore it makes no sense if attempt acquire the lock from
> > process or IRQ context. Something like
> > while (!raw_spin_trylock_irqsave())
> > cpu_relax()
>
> What about the TCP socket locking model?
>
> IOW, the user-space slow path will add itself to a backlog queue
> during contention, and the interrupt fast path will schedule work
> to process any user-space backlog on exit.
I'm sorry, I can't connect the dots here. I was trying to explain that
for the lock in question it is not possible to spin-wait without
disabling interrupts first.
> Cheers,
Sebastian
On Fri, Dec 17, 2021 at 10:00:52AM +0100, Sebastian Andrzej Siewior wrote:
>
> I'm sorry, I can't connect the dots here. I was trying to explain that
> for the lock in question it is not possible to spin-wait without
> disabling interrupts first.
That's precisely the problem the socket lock was designed to
solve.
The way it works is that the interrupt path obtains a normal
spin lock, then tests if the sleepable side is holding the resource
or not. If it is, then the interrupt-path work is added to a
linked list to be processed later.
On the sleepable side, you first obtain the spin lock, then
you check whether the resource is held (by another thread), if
it is you then add yourself to a wait queue and sleep (this is
essentially a home-made mutex).
The tricky bit is in the unlock path on the sleepable side, you
check whether any interrupt-path work has been postponed while
you were holding the resource and process them if they have been.
Take a look at lock_sock/release_sock and bh_lock_sock/bh_unlock_sock
in net/core/sock.c.
Cheers,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On 2021-12-18 14:24:09 [+1100], Herbert Xu wrote:
> On Fri, Dec 17, 2021 at 10:00:52AM +0100, Sebastian Andrzej Siewior wrote:
> >
> > I'm sorry, I can't connect the dots here. I was trying to explain that
> > for the lock in question it is not possible to spin-wait without
> > disabling interrupts first.
>
> That's precisely the problem the socket lock was designed to
> solve.
…
> Take a look at lock_sock/release_sock and bh_lock_sock/bh_unlock_sock
> in net/core/sock.c.
That one. I find the semantic difficult to understand. Some BH-user
check sock_owned_by_user(), some don't.
However, a semaphore should work. The atomic user would do
down_trylock() while preemptible caller would invoke down() and sleep
until the lock is available. Let me check if that works here…
> Cheers,
Sebastian
On Sun, Dec 19, 2021 at 10:48:12AM +0100, Sebastian Andrzej Siewior wrote:
>
> That one. I find the semantic difficult to understand. Some BH-user
> check sock_owned_by_user(), some don't.
> However, a semaphore should work. The atomic user would do
> down_trylock() while preemptible caller would invoke down() and sleep
> until the lock is available. Let me check if that works here…
Right, if your atomic user can simply discard the work then this
is enough.
In the network case, we can't just discard the packet so that's
why there is a backlog queue which the atomic user appends to
if the lock isn't available.
Cheers,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
Hey Sebastian,
Sorry for the delay getting back to you.
I think I understand the motivation for this patchset, and maybe it'll
turn out to be the only way of accomplishing what RT needs. But I
really don't like complicating the irq ingestion flow like that,
splitting the captured state into two pieces, and having multiple
entry points. It makes the whole thing more difficult to analyze and
maintain. Again, maybe we'll *have* to do this ultimately, but I want
to make sure we at least explore the alternatives fully.
One thing you brought up is that the place where a spinlock becomes
problematic for RT is if userspace goes completely nuts with
RNDRESEEDCRNG. If that's really the only place where contention might
be harmful, can't we employ other techniques there instead? For
example, just ratelimiting the frequency at which RNDRESEEDCRNG can be
called _before_ taking that lock, using the usual ratelimit.h
structure? Or, alternatively, what about a lock that very heavily
prioritizes acquisitions from the irq path rather than from the
userspace path? I think Herbert might have had a suggestion there
related to the net stack's sock lock?
Jason
On 2021-12-20 15:38:58 [+0100], Jason A. Donenfeld wrote:
> Hey Sebastian,
Hi Jason,
> I think I understand the motivation for this patchset, and maybe it'll
> turn out to be the only way of accomplishing what RT needs. But I
> really don't like complicating the irq ingestion flow like that,
> splitting the captured state into two pieces, and having multiple
> entry points. It makes the whole thing more difficult to analyze and
> maintain. Again, maybe we'll *have* to do this ultimately, but I want
> to make sure we at least explore the alternatives fully.
Sure.
> One thing you brought up is that the place where a spinlock becomes
> problematic for RT is if userspace goes completely nuts with
> RNDRESEEDCRNG. If that's really the only place where contention might
> be harmful, can't we employ other techniques there instead? For
> example, just ratelimiting the frequency at which RNDRESEEDCRNG can be
> called _before_ taking that lock, using the usual ratelimit.h
> structure?
ratelimit. Didn't think about it.
There is RNDRESEEDCRNG and RNDADDENTROPY from the user API and lets
ignore the kernel users for the moment.
With the DEFAULT_RATELIMIT_BURST we would allow 10 "concurrent"
operations. This isn't as bad as previously but still not perfect (the
latency number jumped up to 50us in a smoke test).
Also in the !__ratelimit() case we would have to return an error. This
in turn breaks the usecase where one invokes 11 times
ioctl(,RNDADDENTROPY,) with a smaller buffer instead once with a big
buffer. Not sure how bad this is but it creates corner cases…
We could also sleep & loop until __ratelimit() allows but it looks odd.
> Or, alternatively, what about a lock that very heavily
> prioritizes acquisitions from the irq path rather than from the
> userspace path? I think Herbert might have had a suggestion there
> related to the net stack's sock lock?
Using a semaphore might work. down_trylock() can be invoked from
hard-irq context while the preemptible context would use down() and
sleep if needed. add_interrupt_randomness() has already a trylock. We
have add_disk_randomness() which is invoked from IRQ-context (on !RT) so
we would have to use trylock there, too (for its mix_pool_bytes()
invocation).
The sock-lock is either always invoked from preemptible context or has a
plan B in the contended case if invoked from atomic context. I'm not
sure what plan B could be here in atomic context. I *think* we need to
do these things and can't delay them.
Now that I think about it, we could add a mutex_t which is acquired
first for the user-API part to ensure that there is only one at a time
(instead of using ratelimit). Assuming that there is nothing else in the
kernel that can hammer on the lock (getrandom() is kind of rate
limited). If we really want to go that road, I would have to retest and
see how long extract_buf() holds the lock acquired.
Either way, we still need to split out the possible crng_reseed()
invocations (if (crng_init < 2)) due to crng_state::lock which must not
be acquired on PREEMPT_RT in hard-irq context
(add_interrupt_randomness() -> credit_entropy_bits()). I *think* the
other places were fine.
> Jason
Sebastian
Hey Sebastian,
I spent the weekend thinking about this some more. I'm actually
warming up a bit to the general approach of the original solution
here, though still have questions. To summarize my understanding of
where we are:
Alternative solution we've been discussing:
- Replace spinlock_t with raw spinlocks.
- Ratelimit userspace-triggered latency inducing ioctls with
ratelimit() and an additional mutex of sorts.
- Result: pretty much the same structure we have now, but with some
added protection for PREEMPT_RT.
Your original solution:
- Absorb into the fast pool during the actual IRQ, but never dump it
into the main pool (nor fast load into the crng directly if
crng_init==0) from the hard irq.
- Instead, have irq_thread() check to see if the calling CPU's fast
pool is >= 64, and if so, dump it into the main pool (or fast load
into the crng directly if crng_init==0).
I have two questions about the implications of your original solution:
1) How often does irq_thread() run? With what we have now, we dump the
fast pool into the main pool at exactly 64 events. With what you're
proposing, we're now in >= 64 territory. How do we conceptualize how
far beyond 64 it's likely to grow before irq_thread() does something?
Is it easy to make guarantees like, "at most, probably around 17"? Or
is it potentially unbounded? Growing beyond isn't actually necessarily
a bad thing, but it could potentially *slow* the collection of
entropy. That probably matters more in the crng_init==0 mode, where
we're just desperate to get whatever we can as fast as we can. But
depending on how large that is, it could matter for the main case too.
Having some handle on the latency added here would be helpful for
thinking about this.
2) If we went with this solution, I think I'd prefer to actually do it
unconditionally, for PREEMPT_RT=y and PREEMPT_RT=n. It's easier to
track how this thing works if the state machine always works in one
way instead of two. It also makes thinking about performance margins
for the various components easier if there's only one way used. Do you
see any downsides in doing this unconditionally?
Regards,
Jason
On 2022-01-30 23:55:09 [+0100], Jason A. Donenfeld wrote:
> Hey Sebastian,
Hi,
> I spent the weekend thinking about this some more. I'm actually
> warming up a bit to the general approach of the original solution
> here, though still have questions. To summarize my understanding of
> where we are:
>
> Alternative solution we've been discussing:
> - Replace spinlock_t with raw spinlocks.
> - Ratelimit userspace-triggered latency inducing ioctls with
> ratelimit() and an additional mutex of sorts.
> - Result: pretty much the same structure we have now, but with some
> added protection for PREEMPT_RT.
>
> Your original solution:
> - Absorb into the fast pool during the actual IRQ, but never dump it
> into the main pool (nor fast load into the crng directly if
> crng_init==0) from the hard irq.
> - Instead, have irq_thread() check to see if the calling CPU's fast
> pool is >= 64, and if so, dump it into the main pool (or fast load
> into the crng directly if crng_init==0).
>
> I have two questions about the implications of your original solution:
>
> 1) How often does irq_thread() run? With what we have now, we dump the
Mostly every interrupt gets threaded. After the primary handler (the
in hard irq) was invoked, the threaded handler gets woken up. The
threaded handler runs before the primary can run again.
Not every interrupt gets threaded. For instance interrupts that are not
threaded are marked as TIMER, PER_CPU, ONESHOT and NO_THREAD.
So on a system with 4 CPUs you can move all peripheral interrupts to
CPU0 leaving CPU1-3 with TIMER interrupts only. In that case, there
would be no irq_thread() invocations on CPU1-3.
> fast pool into the main pool at exactly 64 events. With what you're
> proposing, we're now in >= 64 territory. How do we conceptualize how
> far beyond 64 it's likely to grow before irq_thread() does something?
in theory on a busy RT system (on the previously mentioned one) you
could process all HW interrupts (on CPU0) and wake interrupt threads but
the threads are blocked by a user task (the user land task has a higher
priority than the interrupt thread). In that time further HW interrupts
can trigger on CPU0 for the non-threaded interrupts like the timer
interrupt.
> Is it easy to make guarantees like, "at most, probably around 17"? Or
> is it potentially unbounded? Growing beyond isn't actually necessarily
> a bad thing, but it could potentially *slow* the collection of
> entropy. That probably matters more in the crng_init==0 mode, where
> we're just desperate to get whatever we can as fast as we can. But
> depending on how large that is, it could matter for the main case too.
> Having some handle on the latency added here would be helpful for
> thinking about this.
I have a bigger system which has only network and SATA and here:
PREEMPT, unpatched
[ 10.545739] random: crng init done
[ 10.549548] random: 7 urandom warning(s) missed due to ratelimiting
PREEMPT_RT, patched
[ 11.884035] random: crng init done
[ 11.884037] random: 7 urandom warning(s) missed due to ratelimiting
just from two boots, no real testing. I wouldn't even mention this as a
problem during boot-up.
> 2) If we went with this solution, I think I'd prefer to actually do it
> unconditionally, for PREEMPT_RT=y and PREEMPT_RT=n. It's easier to
> track how this thing works if the state machine always works in one
> way instead of two. It also makes thinking about performance margins
> for the various components easier if there's only one way used. Do you
> see any downsides in doing this unconditionally?
On !PREEMPT_RT you need to specify `threadirq` on the kernel command
line to enable threaded interrupts which are otherwise force-enabled on
PREEMPT_RT. To compensate for that, we would need something as backup.
Say a time or so…
> Regards,
> Jason
Sebastian
On PREEMPT_RT, it's problematic to take spinlocks from hard IRQ
handlers. We can fix this by deferring to a work queue the dumping of
the fast pool into the input pool.
We accomplish this by making `u8 count` an `atomic_t count`, with the
following rules:
- When it's incremented to >= 64, we schedule the work.
- If the top bit is set, we never schedule the work, even if >= 64.
- The worker is responsible for setting it back to 0 when it's done.
- If we need to retry the worker later, we clear the top bit.
In the worst case, an IRQ handler is mixing a new IRQ into the pool at
the same time as the worker is dumping it into the input pool. In this
case, we only ever set the count back to 0 _after_ we're done, so that
subsequent cycles will require a full 64 to dump it in again. In other
words, the result of this race is only ever adding a little bit more
information than normal, but never less, and never crediting any more
for this partial additional information.
Cc: Sebastian Andrzej Siewior <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Theodore Ts'o <[email protected]>
Cc: Sultan Alsawaf <[email protected]>
Cc: Jonathan Neuschäfer <[email protected]>
Signed-off-by: Jason A. Donenfeld <[email protected]>
---
Sebastian - what do you think of this as a deferred scheme to get rid of
that locking? Any downsides of using workqueues like this?
drivers/char/random.c | 67 +++++++++++++++++++----------------
include/trace/events/random.h | 6 ----
2 files changed, 36 insertions(+), 37 deletions(-)
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 455615ac169a..a74897fcb269 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -383,12 +383,6 @@ static void _mix_pool_bytes(const void *in, int nbytes)
blake2s_update(&input_pool.hash, in, nbytes);
}
-static void __mix_pool_bytes(const void *in, int nbytes)
-{
- trace_mix_pool_bytes_nolock(nbytes, _RET_IP_);
- _mix_pool_bytes(in, nbytes);
-}
-
static void mix_pool_bytes(const void *in, int nbytes)
{
unsigned long flags;
@@ -400,11 +394,13 @@ static void mix_pool_bytes(const void *in, int nbytes)
}
struct fast_pool {
- u32 pool[4];
+ struct work_struct mix;
unsigned long last;
+ u32 pool[4];
+ atomic_t count;
u16 reg_idx;
- u8 count;
};
+#define FAST_POOL_MIX_INFLIGHT (1U << 31)
/*
* This is a fast mixing routine used by the interrupt randomness
@@ -434,7 +430,6 @@ static void fast_mix(struct fast_pool *f)
f->pool[0] = a; f->pool[1] = b;
f->pool[2] = c; f->pool[3] = d;
- f->count++;
}
static void process_random_ready_list(void)
@@ -985,12 +980,37 @@ static u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
return *ptr;
}
+static void mix_interrupt_randomness(struct work_struct *work)
+{
+ struct fast_pool *fast_pool = container_of(work, struct fast_pool, mix);
+
+ fast_pool->last = jiffies;
+
+ /* Since this is the result of a trip through the scheduler, xor in
+ * a cycle counter. It can't hurt, and might help.
+ */
+ fast_pool->pool[3] ^= random_get_entropy();
+
+ if (unlikely(crng_init == 0)) {
+ if (crng_fast_load((u8 *)&fast_pool->pool, sizeof(fast_pool->pool)) > 0)
+ atomic_set(&fast_pool->count, 0);
+ else
+ atomic_and(~FAST_POOL_MIX_INFLIGHT, &fast_pool->count);
+ return;
+ }
+
+ mix_pool_bytes(&fast_pool->pool, sizeof(fast_pool->pool));
+ atomic_set(&fast_pool->count, 0);
+ credit_entropy_bits(1);
+}
+
void add_interrupt_randomness(int irq)
{
struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
struct pt_regs *regs = get_irq_regs();
unsigned long now = jiffies;
cycles_t cycles = random_get_entropy();
+ unsigned int new_count;
u32 c_high, j_high;
u64 ip;
@@ -1008,29 +1028,14 @@ void add_interrupt_randomness(int irq)
fast_mix(fast_pool);
add_interrupt_bench(cycles);
- if (unlikely(crng_init == 0)) {
- if ((fast_pool->count >= 64) &&
- crng_fast_load((u8 *)fast_pool->pool, sizeof(fast_pool->pool)) > 0) {
- fast_pool->count = 0;
- fast_pool->last = now;
- }
- return;
+ new_count = (unsigned int)atomic_inc_return(&fast_pool->count);
+ if (new_count >= 64 && new_count < FAST_POOL_MIX_INFLIGHT &&
+ (time_after(now, fast_pool->last + HZ) || unlikely(crng_init == 0))) {
+ if (unlikely(!fast_pool->mix.func))
+ INIT_WORK(&fast_pool->mix, mix_interrupt_randomness);
+ atomic_or(FAST_POOL_MIX_INFLIGHT, &fast_pool->count);
+ schedule_work(&fast_pool->mix);
}
-
- if ((fast_pool->count < 64) && !time_after(now, fast_pool->last + HZ))
- return;
-
- if (!spin_trylock(&input_pool.lock))
- return;
-
- fast_pool->last = now;
- __mix_pool_bytes(&fast_pool->pool, sizeof(fast_pool->pool));
- spin_unlock(&input_pool.lock);
-
- fast_pool->count = 0;
-
- /* award one bit for the contents of the fast pool */
- credit_entropy_bits(1);
}
EXPORT_SYMBOL_GPL(add_interrupt_randomness);
diff --git a/include/trace/events/random.h b/include/trace/events/random.h
index ad149aeaf42c..833f42afc70f 100644
--- a/include/trace/events/random.h
+++ b/include/trace/events/random.h
@@ -52,12 +52,6 @@ DEFINE_EVENT(random__mix_pool_bytes, mix_pool_bytes,
TP_ARGS(bytes, IP)
);
-DEFINE_EVENT(random__mix_pool_bytes, mix_pool_bytes_nolock,
- TP_PROTO(int bytes, unsigned long IP),
-
- TP_ARGS(bytes, IP)
-);
-
TRACE_EVENT(credit_entropy_bits,
TP_PROTO(int bits, int entropy_count, unsigned long IP),
--
2.35.0
On 2022-02-04 16:58:58 [+0100], Jason A. Donenfeld wrote:
> FWIW, the biggest issue with this
>
> On Fri, Feb 4, 2022 at 4:32 PM Jason A. Donenfeld <[email protected]> wrote:
> > +static void mix_interrupt_randomness(struct work_struct *work)
> > +{
> [...]
> > + if (unlikely(crng_init == 0)) {
> > + if (crng_fast_load((u8 *)&fast_pool->pool, sizeof(fast_pool->pool)) > 0)
> > + atomic_set(&fast_pool->count, 0);
> > + else
> > + atomic_and(~FAST_POOL_MIX_INFLIGHT, &fast_pool->count);
> > + return;
> > + }
> [...]
> > void add_interrupt_randomness(int irq)
> > - if (unlikely(crng_init == 0)) {
> > - if ((fast_pool->count >= 64) &&
> > - crng_fast_load((u8 *)fast_pool->pool, sizeof(fast_pool->pool)) > 0) {
> > - fast_pool->count = 0;
> > - fast_pool->last = now;
> > - }
> > - return;
>
> The point of crng_fast_load is to shuffle bytes into the crng as fast
> as possible for very early boot usage. Deferring that to a workqueue
> seems problematic. So I think at the very least _that_ part will have
> to stay in the IRQ handler. That means we've still got a spinlock. But
> at least it's a less problematic one than the input pool spinlock, and
> perhaps we can deal with that some other way than this patch's
> approach.
RT wise we _could_ acquire that spinlock_t in IRQ context early during
boot as long as system_state < SYSTEM_SCHEDULING. After that, we could
dead lock.
> In other words, this approach for the calls to mix_pool_bytes, and a
> different approach for that call to crng_fast_load.
>
> Jason
Sebastian
Hi Sultan,
On Sat, Feb 5, 2022 at 5:02 AM Sultan Alsawaf <[email protected]> wrote:
> The __this_cpu_{ATOMIC_OP}() functions are for atomically performing a single
> per-CPU operation for the current CPU from contexts that permit CPU migration.
> Since this code is safe from CPU migrations (add_interrupt_randomness() runs in
> hardirq context), the atomic per-CPU helpers are unneeded. Instead of using
> __this_cpu_inc_return() and __this_cpu_or(), we can operate on the per-CPU
> pointer directly without any extra safety (e.g., `++fast_pool->count` can be
> used in place of `__this_cpu_inc_return(irq_randomness.count)`).
Oh, right, thanks. We're already in irq so we don't have to worried
about load,add,store being cut up in any way. I'll go back to simple
increments for v3.
Jason
On 2022-02-04 16:31:49 [+0100], Jason A. Donenfeld wrote:
> Sebastian - what do you think of this as a deferred scheme to get rid of
> that locking? Any downsides of using workqueues like this?
I backported additionally the commit
random: use computational hash for entropy extraction
and thrown both into my v5.17-RT tree. From the debugging it looks good.
Will do more testing more next week.
| <idle>-0 [005] dn.h2.. 9189.894548: workqueue_queue_work: work struct=00000000ad070cf1 function=mix_interrupt_randomness wor
|kqueue=events req_cpu=8192 cpu=5
| kworker/5:2-1071 [005] ....... 9189.894594: workqueue_execute_start: work struct 00000000ad070cf1: function mix_interrupt_randomness
| kworker/5:2-1071 [005] ....... 9189.894595: workqueue_execute_end: work struct 00000000ad070cf1: function mix_interrupt_randomness
> drivers/char/random.c | 67 +++++++++++++++++++----------------
> include/trace/events/random.h | 6 ----
> 2 files changed, 36 insertions(+), 37 deletions(-)
>
> diff --git a/drivers/char/random.c b/drivers/char/random.c
> index 455615ac169a..a74897fcb269 100644
> --- a/drivers/char/random.c
> +++ b/drivers/char/random.c
> @@ -383,12 +383,6 @@ static void _mix_pool_bytes(const void *in, int nbytes)
> blake2s_update(&input_pool.hash, in, nbytes);
> }
>
> -static void __mix_pool_bytes(const void *in, int nbytes)
> -{
> - trace_mix_pool_bytes_nolock(nbytes, _RET_IP_);
> - _mix_pool_bytes(in, nbytes);
> -}
> -
> static void mix_pool_bytes(const void *in, int nbytes)
> {
> unsigned long flags;
> @@ -400,11 +394,13 @@ static void mix_pool_bytes(const void *in, int nbytes)
> }
>
> struct fast_pool {
> - u32 pool[4];
> + struct work_struct mix;
> unsigned long last;
> + u32 pool[4];
> + atomic_t count;
> u16 reg_idx;
> - u8 count;
> };
> +#define FAST_POOL_MIX_INFLIGHT (1U << 31)
>
> /*
> * This is a fast mixing routine used by the interrupt randomness
> @@ -434,7 +430,6 @@ static void fast_mix(struct fast_pool *f)
>
> f->pool[0] = a; f->pool[1] = b;
> f->pool[2] = c; f->pool[3] = d;
> - f->count++;
> }
>
> static void process_random_ready_list(void)
> @@ -985,12 +980,37 @@ static u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
> return *ptr;
> }
>
> +static void mix_interrupt_randomness(struct work_struct *work)
> +{
> + struct fast_pool *fast_pool = container_of(work, struct fast_pool, mix);
> +
> + fast_pool->last = jiffies;
> +
> + /* Since this is the result of a trip through the scheduler, xor in
> + * a cycle counter. It can't hurt, and might help.
> + */
Please do a proper two line comment.
> + fast_pool->pool[3] ^= random_get_entropy();
> +
> + if (unlikely(crng_init == 0)) {
> + if (crng_fast_load((u8 *)&fast_pool->pool, sizeof(fast_pool->pool)) > 0)
> + atomic_set(&fast_pool->count, 0);
> + else
> + atomic_and(~FAST_POOL_MIX_INFLIGHT, &fast_pool->count);
> + return;
> + }
> +
> + mix_pool_bytes(&fast_pool->pool, sizeof(fast_pool->pool));
> + atomic_set(&fast_pool->count, 0);
> + credit_entropy_bits(1);
> +}
> +
> void add_interrupt_randomness(int irq)
> {
> struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
> struct pt_regs *regs = get_irq_regs();
> unsigned long now = jiffies;
> cycles_t cycles = random_get_entropy();
> + unsigned int new_count;
> u32 c_high, j_high;
> u64 ip;
>
> @@ -1008,29 +1028,14 @@ void add_interrupt_randomness(int irq)
> fast_mix(fast_pool);
> add_interrupt_bench(cycles);
>
> - if (unlikely(crng_init == 0)) {
> - if ((fast_pool->count >= 64) &&
> - crng_fast_load((u8 *)fast_pool->pool, sizeof(fast_pool->pool)) > 0) {
> - fast_pool->count = 0;
> - fast_pool->last = now;
> - }
> - return;
> + new_count = (unsigned int)atomic_inc_return(&fast_pool->count);
> + if (new_count >= 64 && new_count < FAST_POOL_MIX_INFLIGHT &&
> + (time_after(now, fast_pool->last + HZ) || unlikely(crng_init == 0))) {
> + if (unlikely(!fast_pool->mix.func))
> + INIT_WORK(&fast_pool->mix, mix_interrupt_randomness);
> + atomic_or(FAST_POOL_MIX_INFLIGHT, &fast_pool->count);
No need for atomic. If this is truly per-CPU then there will be no
cross-CPU access, right?
Therefore I would suggest to use __this_cpu_inc_return() which would avoid
the sync prefix for the inc operation. Same for __this_cpu_or(). And you
could use unsigned int.
> + schedule_work(&fast_pool->mix);
schedule_work() has a check which ensures that the work is not scheduled
again if still pending. But we could consider it fast-path and say that
it makes sense to keep it.
You could use schedule_work_on() so it remains on the local CPU. Makes
probably even more sense on NUMA systems. Otherwise it is an unbound
worker and the scheduler (and even the worker)_could_ move it around.
With schedule_work_on() it still _could_ be moved to another CPU during
CPU hotplug. Therefore you should check in mix_interrupt_randomness() if
the worker is == this_cpu_ptr() and otherwise abort. Puh this asks for a
CPU-hotplug handler to reset FAST_POOL_MIX_INFLIGHT in the CPU-UP path.
That would be the price for using this_cpu_inc()/or ;)
This might even classify for using system_highpri_wq (e.g.
queue_work_on(raw_smp_processor_id(), system_highpri_wq, &fast_pool->mix);
).
> }
> -
> - if ((fast_pool->count < 64) && !time_after(now, fast_pool->last + HZ))
> - return;
> -
> - if (!spin_trylock(&input_pool.lock))
> - return;
> -
> - fast_pool->last = now;
> - __mix_pool_bytes(&fast_pool->pool, sizeof(fast_pool->pool));
> - spin_unlock(&input_pool.lock);
> -
> - fast_pool->count = 0;
> -
> - /* award one bit for the contents of the fast pool */
> - credit_entropy_bits(1);
> }
> EXPORT_SYMBOL_GPL(add_interrupt_randomness);
Sebastian
On Fri, Feb 04, 2022 at 09:47:23PM +0100, Sebastian Andrzej Siewior wrote:
> No need for atomic. If this is truly per-CPU then there will be no
> cross-CPU access, right?
> Therefore I would suggest to use __this_cpu_inc_return() which would avoid
> the sync prefix for the inc operation. Same for __this_cpu_or(). And you
> could use unsigned int.
Hi,
The __this_cpu_{ATOMIC_OP}() functions are for atomically performing a single
per-CPU operation for the current CPU from contexts that permit CPU migration.
Since this code is safe from CPU migrations (add_interrupt_randomness() runs in
hardirq context), the atomic per-CPU helpers are unneeded. Instead of using
__this_cpu_inc_return() and __this_cpu_or(), we can operate on the per-CPU
pointer directly without any extra safety (e.g., `++fast_pool->count` can be
used in place of `__this_cpu_inc_return(irq_randomness.count)`).
Sultan