LinuxLists.cc - [RFC PATCH 0/3] random: replace urandom pool with a CRNG

[permalink] [raw]

Subject: [PATCH 3/3] random: add interrupt callback to VMBus IRQ handler

From: Stephan Mueller <[email protected]>

The Hyper-V Linux Integration Services use the VMBus implementation for
communication with the Hypervisor. VMBus registers its own interrupt
handler that completely bypasses the common Linux interrupt handling.
This implies that the interrupt entropy collector is not triggered.

This patch adds the interrupt entropy collection callback into the VMBus
interrupt handler function.

Signed-off-by: Stephan Mueller <[email protected]>
Signed-off-by: Stephan Mueller <[email protected]>
Signed-off-by: Theodore Ts'o <[email protected]>
---
drivers/char/random.c | 1 +
drivers/hv/vmbus_drv.c | 3 +++
2 files changed, 4 insertions(+)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index d5bb3b3..c3f17c9 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1133,6 +1133,7 @@ void add_interrupt_randomness(int irq, int irq_flags)
/* award one bit for the contents of the fast pool */
credit_entropy_bits(r, credit + 1);
}
+EXPORT_SYMBOL_GPL(add_interrupt_randomness);

#ifdef CONFIG_BLOCK
void add_disk_randomness(struct gendisk *disk)
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 64713ff..9af61bb 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -41,6 +41,7 @@
#include <linux/ptrace.h>
#include <linux/screen_info.h>
#include <linux/kdebug.h>
+#include <linux/random.h>
#include "hyperv_vmbus.h"

static struct acpi_device *hv_acpi_dev;
@@ -801,6 +802,8 @@ static void vmbus_isr(void)
else
tasklet_schedule(hv_context.msg_dpc[cpu]);
}
+
+ add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
}

--
2.5.0

2016-05-02 06:27:18

[permalink] [raw]

Subject: [PATCH 1/3] random: replace non-blocking pool with a Chacha20-based CRNG

The CRNG is faster, and we don't pretend to track entropy usage in the
CRNG any more.

Signed-off-by: Theodore Ts'o <[email protected]>
---
crypto/chacha20_generic.c | 61 ----------
drivers/char/random.c | 282 ++++++++++++++++++++++++++++++++++------------
include/crypto/chacha20.h | 1 +
lib/Makefile | 2 +-
lib/chacha20.c | 79 +++++++++++++
5 files changed, 294 insertions(+), 131 deletions(-)
create mode 100644 lib/chacha20.c

diff --git a/crypto/chacha20_generic.c b/crypto/chacha20_generic.c
index da9c899..1cab831 100644
--- a/crypto/chacha20_generic.c
+++ b/crypto/chacha20_generic.c
@@ -15,72 +15,11 @@
#include <linux/module.h>
#include <crypto/chacha20.h>

-static inline u32 rotl32(u32 v, u8 n)
-{
- return (v << n) | (v >> (sizeof(v) * 8 - n));
-}
-
static inline u32 le32_to_cpuvp(const void *p)
{
return le32_to_cpup(p);
}

-static void chacha20_block(u32 *state, void *stream)
-{
- u32 x[16], *out = stream;
- int i;
-
- for (i = 0; i < ARRAY_SIZE(x); i++)
- x[i] = state[i];
-
- for (i = 0; i < 20; i += 2) {
- x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 16);
- x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 16);
- x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 16);
- x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 16);
-
- x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 12);
- x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 12);
- x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 12);
- x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 12);
-
- x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 8);
- x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 8);
- x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 8);
- x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 8);
-
- x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 7);
- x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 7);
- x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 7);
- x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 7);
-
- x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 16);
- x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 16);
- x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 16);
- x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 16);
-
- x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 12);
- x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 12);
- x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 12);
- x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 12);
-
- x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 8);
- x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 8);
- x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 8);
- x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 8);
-
- x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 7);
- x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 7);
- x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 7);
- x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 7);
- }
-
- for (i = 0; i < ARRAY_SIZE(x); i++)
- out[i] = cpu_to_le32(x[i] + state[i]);
-
- state[12]++;
-}
-
static void chacha20_docrypt(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes)
{
diff --git a/drivers/char/random.c b/drivers/char/random.c
index b583e53..95f4451 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -260,6 +260,7 @@
#include <linux/irq.h>
#include <linux/syscalls.h>
#include <linux/completion.h>
+#include <crypto/chacha20.h>

#include <asm/processor.h>
#include <asm/uaccess.h>
@@ -412,6 +413,15 @@ static struct fasync_struct *fasync;
static DEFINE_SPINLOCK(random_ready_list_lock);
static LIST_HEAD(random_ready_list);

+/*
+ * crng_init = 0 --> Uninitialized
+ * 2 --> Initialized
+ * 3 --> Initialized from input_pool
+ */
+static int crng_init = 0;
+#define crng_ready() (likely(crng_init >= 2))
+static void process_random_ready_list(void);
+
/**********************************************************************
*
* OS independent entropy store. Here are the functions which handle
@@ -441,10 +451,13 @@ struct entropy_store {
__u8 last_data[EXTRACT_SIZE];
};

+static ssize_t extract_entropy(struct entropy_store *r, void *buf,
+ size_t nbytes, int min, int rsvd);
+
+static int crng_reseed(struct entropy_store *r);
static void push_to_pool(struct work_struct *work);
static __u32 input_pool_data[INPUT_POOL_WORDS];
static __u32 blocking_pool_data[OUTPUT_POOL_WORDS];
-static __u32 nonblocking_pool_data[OUTPUT_POOL_WORDS];

static struct entropy_store input_pool = {
.poolinfo = &poolinfo_table[0],
@@ -465,16 +478,6 @@ static struct entropy_store blocking_pool = {
push_to_pool),
};

-static struct entropy_store nonblocking_pool = {
- .poolinfo = &poolinfo_table[1],
- .name = "nonblocking",
- .pull = &input_pool,
- .lock = __SPIN_LOCK_UNLOCKED(nonblocking_pool.lock),
- .pool = nonblocking_pool_data,
- .push_work = __WORK_INITIALIZER(nonblocking_pool.push_work,
- push_to_pool),
-};
-
static __u32 const twist_table[8] = {
0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158,
0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 };
@@ -677,12 +680,6 @@ retry:
if (!r->initialized && r->entropy_total > 128) {
r->initialized = 1;
r->entropy_total = 0;
- if (r == &nonblocking_pool) {
- prandom_reseed_late();
- process_random_ready_list();
- wake_up_all(&urandom_init_wait);
- pr_notice("random: %s pool is initialized\n", r->name);
- }
}

trace_credit_entropy_bits(r->name, nbits,
@@ -692,30 +689,27 @@ retry:
if (r == &input_pool) {
int entropy_bits = entropy_count >> ENTROPY_SHIFT;

+ if (crng_init < 3 && entropy_bits >= 128) {
+ (void) crng_reseed(r);
+ entropy_bits = r->entropy_count >> ENTROPY_SHIFT;
+ }
+
/* should we wake readers? */
if (entropy_bits >= random_read_wakeup_bits) {
wake_up_interruptible(&random_read_wait);
kill_fasync(&fasync, SIGIO, POLL_IN);
}
/* If the input pool is getting full, send some
- * entropy to the two output pools, flipping back and
- * forth between them, until the output pools are 75%
- * full.
+ * entropy to the blocking pool until it is 75% full.
*/
if (entropy_bits > random_write_wakeup_bits &&
r->initialized &&
r->entropy_total >= 2*random_read_wakeup_bits) {
- static struct entropy_store *last = &blocking_pool;
struct entropy_store *other = &blocking_pool;

- if (last == &blocking_pool)
- other = &nonblocking_pool;
if (other->entropy_count <=
- 3 * other->poolinfo->poolfracbits / 4)
- last = other;
- if (last->entropy_count <=
- 3 * last->poolinfo->poolfracbits / 4) {
- schedule_work(&last->push_work);
+ 3 * other->poolinfo->poolfracbits / 4) {
+ schedule_work(&other->push_work);
r->entropy_total = 0;
}
}
@@ -735,6 +729,158 @@ static void credit_entropy_bits_safe(struct entropy_store *r, int nbits)

/*********************************************************************
*
+ * CRNG using CHACHA20
+ *
+ *********************************************************************/
+
+#define CRNG_RESEED_INTERVAL (300*HZ)
+
+struct crng_state {
+ __u32 state[16];
+ unsigned long init_time;
+ spinlock_t lock;
+};
+
+struct crng_state primary_crng = {
+ .lock = __SPIN_LOCK_UNLOCKED(primary_crng.lock),
+};
+static DECLARE_WAIT_QUEUE_HEAD(crng_init_wait);
+
+static void _initialize_crng(struct crng_state *crng)
+{
+ int i;
+ unsigned long rv;
+
+ memcpy(&crng->state[0], "expand 32-byte k", 16);
+ for (i = 4; i < 16; i++) {
+ if (!arch_get_random_seed_long(&rv) &&
+ !arch_get_random_long(&rv))
+ rv = random_get_entropy();
+ crng->state[i] ^= rv;
+ }
+ crng->init_time = jiffies - CRNG_RESEED_INTERVAL;
+}
+
+static void initialize_crng(struct crng_state *crng)
+{
+ _initialize_crng(crng);
+ spin_lock_init(&crng->lock);
+}
+
+static int crng_fast_load(__u32 pool[4])
+{
+ int i;
+ __u32 *p;
+
+ if (!spin_trylock(&primary_crng.lock))
+ return 0;
+ if (crng_ready()) {
+ spin_unlock(&primary_crng.lock);
+ return 0;
+ }
+ p = &primary_crng.state[4];
+ if (crng_init == 1)
+ p += 4;
+ for (i=0; i < 4; i++)
+ *p ^= pool[i];
+ if (crng_init++ >= 2)
+ wake_up_interruptible(&crng_init_wait);
+ pr_notice("random: crng_init %d\n", crng_init);
+ spin_unlock(&primary_crng.lock);
+ return 1;
+}
+
+/* Returns 1 on success */
+static int crng_reseed(struct entropy_store *r)
+{
+ unsigned long flags;
+ int ret = 0;
+ int i, num, num_words;
+ __u32 tmp[16];
+
+ spin_lock_irqsave(&primary_crng.lock, flags);
+ num = extract_entropy(r, tmp, 32, 16, 0);
+ if (num == 0)
+ goto out;
+ if (num < 16 || num > 32) {
+ WARN_ON(1);
+ pr_err("crng_reseed: num is %d?!?\n", num);
+ }
+ num_words = (num + 3) / 4;
+ for (i = 0; i < num_words; i++)
+ primary_crng.state[i+4] ^= tmp[i];
+ primary_crng.init_time = jiffies;
+ if (crng_init < 3) {
+ crng_init = 3;
+ process_random_ready_list();
+ wake_up_interruptible(&crng_init_wait);
+ pr_notice("random: crng_init 3\n");
+ }
+ ret = 1;
+out:
+ spin_unlock_irqrestore(&primary_crng.lock, flags);
+ return ret;
+}
+
+static inline void crng_wait_ready(void)
+{
+ wait_event_interruptible(crng_init_wait, crng_ready());
+}
+
+static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE])
+{
+ unsigned long v, flags;
+ struct crng_state *crng = &primary_crng;
+
+ if (crng_init > 2 &&
+ time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL))
+ crng_reseed(&input_pool);
+ spin_lock_irqsave(&crng->lock, flags);
+ if (arch_get_random_long(&v))
+ crng->state[14] ^= v;
+ chacha20_block(&crng->state[0], out);
+ if (crng->state[12] == 0)
+ crng->state[13]++;
+ spin_unlock_irqrestore(&crng->lock, flags);
+}
+
+static ssize_t extract_crng_user(void __user *buf, size_t nbytes)
+{
+ ssize_t ret = 0, i;
+ __u8 tmp[CHACHA20_BLOCK_SIZE];
+ int large_request = (nbytes > 256);
+
+ while (nbytes) {
+ if (large_request && need_resched()) {
+ if (signal_pending(current)) {
+ if (ret == 0)
+ ret = -ERESTARTSYS;
+ break;
+ }
+ schedule();
+ }
+
+ extract_crng(tmp);
+ i = min_t(int, nbytes, CHACHA20_BLOCK_SIZE);
+ if (copy_to_user(buf, tmp, i)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ nbytes -= i;
+ buf += i;
+ ret += i;
+ }
+
+ /* Wipe data just written to memory */
+ memzero_explicit(tmp, sizeof(tmp));
+
+ return ret;
+}
+
+
+/*********************************************************************
+ *
* Entropy input management
*
*********************************************************************/
@@ -749,12 +895,12 @@ struct timer_rand_state {
#define INIT_TIMER_RAND_STATE { INITIAL_JIFFIES, };

/*
- * Add device- or boot-specific data to the input and nonblocking
- * pools to help initialize them to unique values.
+ * Add device- or boot-specific data to the input pool to help
+ * initialize it.
*
- * None of this adds any entropy, it is meant to avoid the
- * problem of the nonblocking pool having similar initial state
- * across largely identical devices.
+ * None of this adds any entropy; it is meant to avoid the problem of
+ * the entropy pool having similar initial state across largely
+ * identical devices.
*/
void add_device_randomness(const void *buf, unsigned int size)
{
@@ -766,11 +912,6 @@ void add_device_randomness(const void *buf, unsigned int size)
_mix_pool_bytes(&input_pool, buf, size);
_mix_pool_bytes(&input_pool, &time, sizeof(time));
spin_unlock_irqrestore(&input_pool.lock, flags);
-
- spin_lock_irqsave(&nonblocking_pool.lock, flags);
- _mix_pool_bytes(&nonblocking_pool, buf, size);
- _mix_pool_bytes(&nonblocking_pool, &time, sizeof(time));
- spin_unlock_irqrestore(&nonblocking_pool.lock, flags);
}
EXPORT_SYMBOL(add_device_randomness);

@@ -801,7 +942,7 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
sample.jiffies = jiffies;
sample.cycles = random_get_entropy();
sample.num = num;
- r = nonblocking_pool.initialized ? &input_pool : &nonblocking_pool;
+ r = &input_pool;
mix_pool_bytes(r, &sample, sizeof(sample));

/*
@@ -921,7 +1062,13 @@ void add_interrupt_randomness(int irq, int irq_flags)
!time_after(now, fast_pool->last + HZ))
return;

- r = nonblocking_pool.initialized ? &input_pool : &nonblocking_pool;
+ if (!crng_ready() && crng_fast_load(fast_pool->pool)) {
+ fast_pool->count = 0;
+ fast_pool->last = now;
+ return;
+ }
+
+ r = &input_pool;
if (!spin_trylock(&r->lock))
return;

@@ -964,9 +1111,6 @@ EXPORT_SYMBOL_GPL(add_disk_randomness);
*
*********************************************************************/

-static ssize_t extract_entropy(struct entropy_store *r, void *buf,
- size_t nbytes, int min, int rsvd);
-
/*
* This utility inline function is responsible for transferring entropy
* from the primary pool to the secondary extraction pool. We make
@@ -1252,15 +1396,26 @@ static ssize_t extract_entropy_user(struct entropy_store *r, void __user *buf,
*/
void get_random_bytes(void *buf, int nbytes)
{
+ __u8 tmp[CHACHA20_BLOCK_SIZE];
+
#if DEBUG_RANDOM_BOOT > 0
- if (unlikely(nonblocking_pool.initialized == 0))
+ if (!crng_ready())
printk(KERN_NOTICE "random: %pF get_random_bytes called "
- "with %d bits of entropy available\n",
- (void *) _RET_IP_,
- nonblocking_pool.entropy_total);
+ "with crng_init = %d\n", (void *) _RET_IP_, crng_init);
#endif
trace_get_random_bytes(nbytes, _RET_IP_);
- extract_entropy(&nonblocking_pool, buf, nbytes, 0, 0);
+
+ while (nbytes >= CHACHA20_BLOCK_SIZE) {
+ extract_crng(buf);
+ buf += CHACHA20_BLOCK_SIZE;
+ nbytes -= CHACHA20_BLOCK_SIZE;
+ }
+
+ if (nbytes > 0) {
+ extract_crng(tmp);
+ memcpy(buf, tmp, nbytes);
+ memzero_explicit(tmp, nbytes);
+ }
}
EXPORT_SYMBOL(get_random_bytes);

@@ -1278,7 +1433,7 @@ int add_random_ready_callback(struct random_ready_callback *rdy)
unsigned long flags;
int err = -EALREADY;

- if (likely(nonblocking_pool.initialized))
+ if (crng_ready())
return err;

owner = rdy->owner;
@@ -1286,7 +1441,7 @@ int add_random_ready_callback(struct random_ready_callback *rdy)
return -ENOENT;

spin_lock_irqsave(&random_ready_list_lock, flags);
- if (nonblocking_pool.initialized)
+ if (crng_ready())
goto out;

owner = NULL;
@@ -1350,7 +1505,7 @@ void get_random_bytes_arch(void *buf, int nbytes)
}

if (nbytes)
- extract_entropy(&nonblocking_pool, p, nbytes, 0, 0);
+ get_random_bytes(p, nbytes);
}
EXPORT_SYMBOL(get_random_bytes_arch);

@@ -1395,7 +1550,7 @@ static int rand_initialize(void)
{
init_std_data(&input_pool);
init_std_data(&blocking_pool);
- init_std_data(&nonblocking_pool);
+ _initialize_crng(&primary_crng);
return 0;
}
early_initcall(rand_initialize);
@@ -1459,16 +1614,10 @@ urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
{
int ret;

- if (unlikely(nonblocking_pool.initialized == 0))
- printk_once(KERN_NOTICE "random: %s urandom read "
- "with %d bits of entropy available\n",
- current->comm, nonblocking_pool.entropy_total);
-
+ crng_wait_ready();
nbytes = min_t(size_t, nbytes, INT_MAX >> (ENTROPY_SHIFT + 3));
- ret = extract_entropy_user(&nonblocking_pool, buf, nbytes);
-
- trace_urandom_read(8 * nbytes, ENTROPY_BITS(&nonblocking_pool),
- ENTROPY_BITS(&input_pool));
+ ret = extract_crng_user(buf, nbytes);
+ trace_urandom_read(8 * nbytes, 0, ENTROPY_BITS(&input_pool));
return ret;
}

@@ -1514,10 +1663,7 @@ static ssize_t random_write(struct file *file, const char __user *buffer,
{
size_t ret;

- ret = write_pool(&blocking_pool, buffer, count);
- if (ret)
- return ret;
- ret = write_pool(&nonblocking_pool, buffer, count);
+ ret = write_pool(&input_pool, buffer, count);
if (ret)
return ret;

@@ -1568,7 +1714,6 @@ static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
input_pool.entropy_count = 0;
- nonblocking_pool.entropy_count = 0;
blocking_pool.entropy_count = 0;
return 0;
default:
@@ -1610,11 +1755,10 @@ SYSCALL_DEFINE3(getrandom, char __user *, buf, size_t, count,
if (flags & GRND_RANDOM)
return _random_read(flags & GRND_NONBLOCK, buf, count);

- if (unlikely(nonblocking_pool.initialized == 0)) {
+ if (!crng_ready()) {
if (flags & GRND_NONBLOCK)
return -EAGAIN;
- wait_event_interruptible(urandom_init_wait,
- nonblocking_pool.initialized);
+ crng_wait_ready();
if (signal_pending(current))
return -ERESTARTSYS;
}
diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h
index 274bbae..20d20f68 100644
--- a/include/crypto/chacha20.h
+++ b/include/crypto/chacha20.h
@@ -16,6 +16,7 @@ struct chacha20_ctx {
u32 key[8];
};

+void chacha20_block(u32 *state, void *stream);
void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv);
int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keysize);
diff --git a/lib/Makefile b/lib/Makefile
index 7bd6fd4..9ba27cd 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -22,7 +22,7 @@ KCOV_INSTRUMENT_hweight.o := n
lib-y := ctype.o string.o vsprintf.o cmdline.o \
rbtree.o radix-tree.o dump_stack.o timerqueue.o\
idr.o int_sqrt.o extable.o \
- sha1.o md5.o irq_regs.o argv_split.o \
+ sha1.o chacha20.o md5.o irq_regs.o argv_split.o \
proportions.o flex_proportions.o ratelimit.o show_mem.o \
is_single_threaded.o plist.o decompress.o kobject_uevent.o \
earlycpio.o seq_buf.o nmi_backtrace.o
diff --git a/lib/chacha20.c b/lib/chacha20.c
new file mode 100644
index 0000000..250ceed
--- /dev/null
+++ b/lib/chacha20.c
@@ -0,0 +1,79 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/bitops.h>
+#include <linux/cryptohash.h>
+#include <asm/unaligned.h>
+#include <crypto/chacha20.h>
+
+static inline u32 rotl32(u32 v, u8 n)
+{
+ return (v << n) | (v >> (sizeof(v) * 8 - n));
+}
+
+extern void chacha20_block(u32 *state, void *stream)
+{
+ u32 x[16], *out = stream;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(x); i++)
+ x[i] = state[i];
+
+ for (i = 0; i < 20; i += 2) {
+ x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 16);
+ x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 16);
+ x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 16);
+ x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 16);
+
+ x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 12);
+ x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 12);
+ x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 12);
+ x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 12);
+
+ x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 8);
+ x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 8);
+ x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 8);
+ x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 8);
+
+ x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 7);
+ x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 7);
+ x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 7);
+ x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 7);
+
+ x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 16);
+ x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 16);
+ x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 16);
+ x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 16);
+
+ x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 12);
+ x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 12);
+ x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 12);
+ x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 12);
+
+ x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 8);
+ x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 8);
+ x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 8);
+ x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 8);
+
+ x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 7);
+ x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 7);
+ x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 7);
+ x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 7);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(x); i++)
+ out[i] = cpu_to_le32(x[i] + state[i]);
+
+ state[12]++;
+}
+EXPORT_SYMBOL(chacha20_block);
--
2.5.0

2016-05-02 06:26:52

[permalink] [raw]

Subject: [PATCH 2/3] random: make /dev/urandom scalable for silly userspace programs

On a system with a 4 socket (NUMA) system where a large number of
application processes were all trying to read from /dev/urandom, this
can result in the system spending 80% of its time contending on the
global urandom spinlock. The application have used its own PRNG, but
let's try to help it from running, lemming-like, straight over the
locking cliff.

Reported-by: Andi Kleen <[email protected]>
Signed-off-by: Theodore Ts'o <[email protected]>
---
drivers/char/random.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 62 insertions(+), 5 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 95f4451..d5bb3b3 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -746,6 +746,17 @@ struct crng_state primary_crng = {
};
static DECLARE_WAIT_QUEUE_HEAD(crng_init_wait);

+#ifdef CONFIG_NUMA
+/*
+ * Hack to deal with crazy userspace progams when they are all trying
+ * to access /dev/urandom in parallel. The programs are almost
+ * certainly doing something terribly wrong, but we'll work around
+ * their brain damage.
+ */
+static struct crng_state **crng_node_pool __read_mostly;
+#endif
+
+
static void _initialize_crng(struct crng_state *crng)
{
int i;
@@ -761,11 +772,13 @@ static void _initialize_crng(struct crng_state *crng)
crng->init_time = jiffies - CRNG_RESEED_INTERVAL;
}

+#ifdef CONFIG_NUMA
static void initialize_crng(struct crng_state *crng)
{
_initialize_crng(crng);
spin_lock_init(&crng->lock);
}
+#endif

static int crng_fast_load(__u32 pool[4])
{
@@ -822,19 +835,23 @@ out:
return ret;
}

+static inline void maybe_reseed_primary_crng(void)
+{
+ if (crng_init > 2 &&
+ time_after(jiffies, primary_crng.init_time + CRNG_RESEED_INTERVAL))
+ crng_reseed(&input_pool);
+}
+
static inline void crng_wait_ready(void)
{
wait_event_interruptible(crng_init_wait, crng_ready());
}

-static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE])
+static void _extract_crng(struct crng_state *crng,
+ __u8 out[CHACHA20_BLOCK_SIZE])
{
unsigned long v, flags;
- struct crng_state *crng = &primary_crng;

- if (crng_init > 2 &&
- time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL))
- crng_reseed(&input_pool);
spin_lock_irqsave(&crng->lock, flags);
if (arch_get_random_long(&v))
crng->state[14] ^= v;
@@ -844,6 +861,30 @@ static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE])
spin_unlock_irqrestore(&crng->lock, flags);
}

+static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE])
+{
+#ifndef CONFIG_NUMA
+ maybe_reseed_primary_crng();
+ _extract_crng(&primary_crng, out);
+#else
+ int node_id = numa_node_id();
+ struct crng_state *crng = crng_node_pool[node_id];
+
+ if (time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL)) {
+ unsigned long flags;
+
+ maybe_reseed_primary_crng();
+ _extract_crng(&primary_crng, out);
+ spin_lock_irqsave(&crng->lock, flags);
+ memcpy(&crng->state[4], out, CHACHA20_KEY_SIZE);
+ crng->state[15] = numa_node_id();
+ crng->init_time = jiffies;
+ spin_unlock_irqrestore(&crng->lock, flags);
+ }
+ _extract_crng(crng, out);
+#endif
+}
+
static ssize_t extract_crng_user(void __user *buf, size_t nbytes)
{
ssize_t ret = 0, i;
@@ -1548,6 +1589,22 @@ static void init_std_data(struct entropy_store *r)
*/
static int rand_initialize(void)
{
+#ifdef CONFIG_NUMA
+ int i;
+ int num_nodes = num_possible_nodes();
+ struct crng_state *crng;
+
+ crng_node_pool = kmalloc(num_nodes * sizeof(void *),
+ GFP_KERNEL|__GFP_NOFAIL);
+
+ for (i=0; i < num_nodes; i++) {
+ crng = kmalloc(sizeof(struct crng_state),
+ GFP_KERNEL | __GFP_NOFAIL);
+ initialize_crng(crng);
+ crng_node_pool[i] = crng;
+
+ }
+#endif
init_std_data(&input_pool);
init_std_data(&blocking_pool);
_initialize_crng(&primary_crng);
--
2.5.0

2016-05-02 07:00:30

[permalink] [raw]

Subject: Re: [PATCH 2/3] random: make /dev/urandom scalable for silly userspace programs

Am Montag, 2. Mai 2016, 02:26:52 schrieb Theodore Ts'o:

Hi Theodore,

I have not digested the patch set yet, but I have the following questions to
your patch set.

> On a system with a 4 socket (NUMA) system where a large number of
> application processes were all trying to read from /dev/urandom, this
> can result in the system spending 80% of its time contending on the
> global urandom spinlock. The application have used its own PRNG, but
> let's try to help it from running, lemming-like, straight over the
> locking cliff.

- initialization: In my DRBG based patch-set I tried serialize the
initialization of the per-NUMA node RNGs as follows: first the node 0 pool is
seeded completely, followed by the other nodes in a completely serial fashion.
If during that initialization time, say, node 3 wants some random number, but
the RNG for node 3 is not yet fully seeded, it goes back to the "default" RNG
of node 0. This way, it is ensured that we try to have properly seeded RNGs
even during heavy load at boot time. Would that make sense here?

- reseed avalanche: I see that you added a time-based reseed code too (I am
glad about that one). What I fear is that there is a reseed avalanche when the
various RNGs are seeded initially closely after each other (and thus the
reseed timer will expire at the same time). That would mean that they can be
reseeded all at the same time again when the timer based threshold expires and
drain the input_pool such that if you have many nodes, the input pool will not
have sufficient capacity (I am not speaking about entropy, but the potential
to store entropy) to satisfy all RNGs at the same time. Hence, we would then
have the potential to have entropy-starved RNGs.

- entropy pool draining: when having a timer-based reseeding on a quiet
system, the entropy pool can be drained during the expiry of the timer. So, I
tried to handle that by increasing the timer by, say, 100 seconds for each new
NUMA node. Note, even the baseline of 300 seconds with CRNG_RESEED_INTERVAL is
low. When I experimented with that on a KVM test system and left it quiet,
entropy pool draining was prevented at around 500 seconds.

Ciao
Stephan

2016-05-02 09:00:48

[permalink] [raw]

Subject: Re: [PATCH 3/3] random: add interrupt callback to VMBus IRQ handler

On Mon, May 2, 2016 at 2:26 AM, Theodore Ts'o <[email protected]> wrote:
> From: Stephan Mueller <[email protected]>
>
> The Hyper-V Linux Integration Services use the VMBus implementation for
> communication with the Hypervisor. VMBus registers its own interrupt
> handler that completely bypasses the common Linux interrupt handling.
> This implies that the interrupt entropy collector is not triggered.
> ...

Stephan correctly identified the problem of virtualized environments
in his paper, but there does not appear to be any real defenses in
place for VM rollback attacks.

Perhpas the following will make interesting reading:

* When Virtual is Harder than Real: Security Challenges in Virtual
Machine Based Computing Environments,
https://www.usenix.org/legacy/event/hotos05/final_papers/full_papers/garfinkel/garfinkel.pdf

* When Good Randomness Goes Bad: Virtual Machine Reset Vulnerabilities
and Hedging Deployed Cryptography,
http://pages.cs.wisc.edu/~rist/papers/sslhedge.pdf

Jeff

2016-05-02 09:14:28

[permalink] [raw]

Subject: Re: [PATCH 3/3] random: add interrupt callback to VMBus IRQ handler

Am Montag, 2. Mai 2016, 05:00:47 schrieb Jeffrey Walton:

Hi Jeffrey,

> On Mon, May 2, 2016 at 2:26 AM, Theodore Ts'o <[email protected]> wrote:
> > From: Stephan Mueller <[email protected]>
> >
> > The Hyper-V Linux Integration Services use the VMBus implementation for
> > communication with the Hypervisor. VMBus registers its own interrupt
> > handler that completely bypasses the common Linux interrupt handling.
> > This implies that the interrupt entropy collector is not triggered.
> > ...
>
> Stephan correctly identified the problem of virtualized environments
> in his paper, but there does not appear to be any real defenses in
> place for VM rollback attacks.

The issue the patch addresses is only that on Hyper-V with para-virt drivers,
the /dev/random implementation does not receive interrupts.

The issue of rollback (if you refer to activating an earlier saved image of
the guest) is a real issue the guest cannot do anything about it that is
effective (i.e. the guest can do without the help of the VMM). Note, rollback
is just a special case of a much broader issue of the duplication of the RNG
state by the VMM (be it snapshots, move of a guest to another VMM,
suspend/resume, ...). However, the patch to enable interrupts does not seem to
be related to that issue as interrupts are not re-issued in case of rollbacks,
are they?

Ciao
Stephan

2016-05-02 12:50:29

[permalink] [raw]

Subject: Re: [PATCH 2/3] random: make /dev/urandom scalable for silly userspace programs

On Mon, May 02, 2016 at 09:00:22AM +0200, Stephan Mueller wrote:
> - reseed avalanche: I see that you added a time-based reseed code too (I am
> glad about that one). What I fear is that there is a reseed avalanche when the
> various RNGs are seeded initially closely after each other (and thus the
> reseed timer will expire at the same time). That would mean that they can be
> reseeded all at the same time again when the timer based threshold expires and
> drain the input_pool such that if you have many nodes, the input pool will not
> have sufficient capacity (I am not speaking about entropy, but the potential
> to store entropy) to satisfy all RNGs at the same time. Hence, we would then
> have the potential to have entropy-starved RNGs.

The crng is a CRNG, not an entropy pool. So we don't pretend to track
entropy on the CRNG's at all. The current rule is that when you draw
from a crng, if it has been over 5 mintues, it will reseed from its
"parent" source. In the case of the primary_crng will draw between
128 and 256 bits of entropy from the input pool. In the per-NUMA node
case, they draw from the primary_crng.

So if there are many secondary (per-NUMA node) CRNG's that are seeded
within five minutes of each other, the input pool only gets drawn down
once to seed the primary_crng. The per-NUMA node crng's feed from the
primary crng, and absent some catastrophic security breach where the
adversary can read kernel memory (at which point you're toast anyway)
the output of the primary_crng is never exposed directly outside of
the system. So even if you have some crazy SGI system with 1024 NUMA
nodes, the primary_crng will only be generating at most 32k worth of
data to seed the secondary crng's before it gets reseed --- and the
input pool is only going to be debited at most 128-256 bits of entropy
each time.

I thought about using the primary_crng to serve double duty as the
CRNG for NUMA node 0, but I decided that on a NUMA system you have
TB's and TB's of memory, and so blowing another 80 bytes or so on a
separate primary_crng state makes the security analysis much simpler,
and the code much simpler. I also thought about only dynamically
initializing a node_id's CRNG if a spin_trylock on node 0's CRNG
failed, but again, decided against it in the itnerests of keeping
things simple and that NUMA people can afford to be profligate with
memory --- and they're blowing way more than 80 bytes per NUMA node
anyway. Besides, manufactuers of crazy-expensive NUMA systems have to
feed their children, too. :-)

> - entropy pool draining: when having a timer-based reseeding on a quiet
> system, the entropy pool can be drained during the expiry of the timer. So, I
> tried to handle that by increasing the timer by, say, 100 seconds for each new
> NUMA node. Note, even the baseline of 300 seconds with CRNG_RESEED_INTERVAL is
> low. When I experimented with that on a KVM test system and left it quiet,
> entropy pool draining was prevented at around 500 seconds.

Sure, but if no one is actually *using* the system, who cares about
whether the input pool's entropy is getting drawn down? The usual
reason why we might want to worry about reseeding frequently is if the
system is generating a huge amount of randomness for some reason.
This might be a good reason (you're running a IPSEC server and
generating lots of IKE session keys) or it might be for a really
stupid reason (dd if=/dev/urandom of=/dev/sdX bs=4k), but either way,
there will be lots of disk or networking interrupts to feed the input
pool.

I have thought about adding something a bit more sophisticated to
control the reseed logic (either tracking amount of data used, or
making the reseed interval adjustable, or dynamically adjustable), but
this was the simplest thing to do as a starting point. Besides for
the people who believe that it's realistic to write academic papers
about recovering from catastrophic security exposures where the bad
guy can read arbitrary kernel memory, and somehow _not_ managed to
bootstrap that into a full privilege escalation attack and installed a
backdoor into your BIOS so that you are permanently pwned, they might
be happy that we will be trying to recover within 5 minutes. :-)

- Ted

2016-05-02 12:56:18

[permalink] [raw]

Subject: Re: [PATCH 3/3] random: add interrupt callback to VMBus IRQ handler

On Mon, May 02, 2016 at 11:14:25AM +0200, Stephan Mueller wrote:
> The issue of rollback (if you refer to activating an earlier saved image of
> the guest) is a real issue the guest cannot do anything about it that is
> effective (i.e. the guest can do without the help of the VMM). Note, rollback
> is just a special case of a much broader issue of the duplication of the RNG
> state by the VMM (be it snapshots, move of a guest to another VMM,
> suspend/resume, ...). However, the patch to enable interrupts does not seem to
> be related to that issue as interrupts are not re-issued in case of rollbacks,
> are they?

Rollback is just a much broader issue of how can you maintain security
when the VMM is run by the NSA, and can do arbitrary things to mess
with the security of the guest OS (including reading keys straight out
of guest kernel memory, etc.). Hint: you can't. :-)

If we are talking about someone who is realistically trying to do
something useful with duplicating VMM state, I'm not aware of anyone
who is actually trying to clone a running VMM in order to launch new
worker nodes. People will clone disk snapshots to rapidly bring up
rapid nodes, and so making sure we have a way to handle cases where
you can't count on /var/state/random.seed on being useful is
important. The usual answer is to use something like virtio-rng, but
all of the answers are going to assume that the host system is
trustworthy.

If you are worried about a potential attack where the CIA has cut a
deal with Amazon AWS just as the NSA did with RSADSI and DUAL-EC DRBG,
you might as well go home...

- Ted

2016-05-02 13:49:07

[permalink] [raw]

Subject: Re: [PATCH 2/3] random: make /dev/urandom scalable for silly userspace programs

On Mon, May 02, 2016 at 08:50:14AM -0400, Theodore Ts'o wrote:
> > - entropy pool draining: when having a timer-based reseeding on a quiet
> > system, the entropy pool can be drained during the expiry of the timer. So, I
> > tried to handle that by increasing the timer by, say, 100 seconds for each new
> > NUMA node. Note, even the baseline of 300 seconds with CRNG_RESEED_INTERVAL is
> > low. When I experimented with that on a KVM test system and left it quiet,
> > entropy pool draining was prevented at around 500 seconds.

One other thought. If your KVM test system was completely quiet, then
all of the entropy was coming from timer interrupts. It is an open
question whether an adversary could predict the bit of "entropy" you
are generating with better than 50% probability if both the host and
the guest system are quiescent. And if they can, then maybe assuming
one bit of entropy per interrupt might be a too optimistic.

This is especially true on bare metal where very often, especially on
smaller machines, where there is a single oscillator from which all of
the clocks on the SOC or motherboard are derived. There is a reason
why I was being ultra conservative in sampling 64 interrupts into a
32-bit fast-mix pool before mixing it into the input pool, and only
crediting the pool with a single bit of entropy each time I did this.

(It's also because of this conservatism that I was comfortable with
having add_disk_randomness giving some extra credit for interrupts
that are probably more likely to be hard-to-predict by an adversary.
Especially if the interrupts are coming from a device with spinning
rust platters.)

- Ted

2016-05-02 13:53:58

[permalink] [raw]

Subject: Re: [PATCH 2/3] random: make /dev/urandom scalable for silly userspace programs

Am Montag, 2. Mai 2016, 09:48:57 schrieb Theodore Ts'o:

Hi Theodore,

> On Mon, May 02, 2016 at 08:50:14AM -0400, Theodore Ts'o wrote:
> > > - entropy pool draining: when having a timer-based reseeding on a quiet
> > > system, the entropy pool can be drained during the expiry of the timer.
> > > So, I tried to handle that by increasing the timer by, say, 100 seconds
> > > for each new NUMA node. Note, even the baseline of 300 seconds with
> > > CRNG_RESEED_INTERVAL is low. When I experimented with that on a KVM
> > > test system and left it quiet, entropy pool draining was prevented at
> > > around 500 seconds.
>
> One other thought. If your KVM test system was completely quiet, then
> all of the entropy was coming from timer interrupts. It is an open
> question whether an adversary could predict the bit of "entropy" you
> are generating with better than 50% probability if both the host and
> the guest system are quiescent. And if they can, then maybe assuming
> one bit of entropy per interrupt might be a too optimistic.

It is not entirely quiet -- systemd likes to dump data on the disk once in a
while. So, it is no timer interrupt that I see.

Note, my test system runs as tickless kernel.
>
> This is especially true on bare metal where very often, especially on
> smaller machines, where there is a single oscillator from which all of
> the clocks on the SOC or motherboard are derived. There is a reason
> why I was being ultra conservative in sampling 64 interrupts into a
> 32-bit fast-mix pool before mixing it into the input pool, and only
> crediting the pool with a single bit of entropy each time I did this.

As I do not think that we see any timer interrupts, I think this argument may
not count.

Besides, I have not seen any timer interrupts lately (with or without a
tickless kernel). Thus, which interrupt do you think is a timer interrupt?

Ciao
Stephan

2016-05-03 08:50:25

[permalink] [raw]

Subject: Re: [PATCH 1/3] random: replace non-blocking pool with a Chacha20-based CRNG

Am Montag, 2. Mai 2016, 02:26:51 schrieb Theodore Ts'o:

Hi Theodore,

> The CRNG is faster, and we don't pretend to track entropy usage in the
> CRNG any more.

In general, I have no concerns with this approach either. And thank you that
some of my concerns are addressed.

There are few more concerns left open. I would suggest I would write them up
with a proposal on how to address them.

Some comments inlne:
>
> Signed-off-by: Theodore Ts'o <[email protected]>
> ---
> crypto/chacha20_generic.c | 61 ----------
> drivers/char/random.c | 282
> ++++++++++++++++++++++++++++++++++------------ include/crypto/chacha20.h |
> 1 +
> lib/Makefile | 2 +-
> lib/chacha20.c | 79 +++++++++++++
> 5 files changed, 294 insertions(+), 131 deletions(-)
> create mode 100644 lib/chacha20.c
>
> diff --git a/crypto/chacha20_generic.c b/crypto/chacha20_generic.c
> index da9c899..1cab831 100644
> --- a/crypto/chacha20_generic.c
> +++ b/crypto/chacha20_generic.c
> @@ -15,72 +15,11 @@
> #include <linux/module.h>
> #include <crypto/chacha20.h>
>
> -static inline u32 rotl32(u32 v, u8 n)
> -{
> - return (v << n) | (v >> (sizeof(v) * 8 - n));
> -}
> -
> static inline u32 le32_to_cpuvp(const void *p)
> {
> return le32_to_cpup(p);
> }
>
> -static void chacha20_block(u32 *state, void *stream)
> -{
> - u32 x[16], *out = stream;
> - int i;
> -
> - for (i = 0; i < ARRAY_SIZE(x); i++)
> - x[i] = state[i];
> -
> - for (i = 0; i < 20; i += 2) {
> - x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 16);
> - x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 16);
> - x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 16);
> - x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 16);
> -
> - x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 12);
> - x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 12);
> - x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 12);
> - x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 12);
> -
> - x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 8);
> - x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 8);
> - x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 8);
> - x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 8);
> -
> - x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 7);
> - x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 7);
> - x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 7);
> - x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 7);
> -
> - x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 16);
> - x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 16);
> - x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 16);
> - x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 16);
> -
> - x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 12);
> - x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 12);
> - x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 12);
> - x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 12);
> -
> - x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 8);
> - x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 8);
> - x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 8);
> - x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 8);
> -
> - x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 7);
> - x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 7);
> - x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 7);
> - x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 7);
> - }
> -
> - for (i = 0; i < ARRAY_SIZE(x); i++)
> - out[i] = cpu_to_le32(x[i] + state[i]);
> -
> - state[12]++;
> -}
> -
> static void chacha20_docrypt(u32 *state, u8 *dst, const u8 *src,
> unsigned int bytes)
> {
> diff --git a/drivers/char/random.c b/drivers/char/random.c
> index b583e53..95f4451 100644
> --- a/drivers/char/random.c
> +++ b/drivers/char/random.c
> @@ -260,6 +260,7 @@
> #include <linux/irq.h>
> #include <linux/syscalls.h>
> #include <linux/completion.h>
> +#include <crypto/chacha20.h>
>
> #include <asm/processor.h>
> #include <asm/uaccess.h>
> @@ -412,6 +413,15 @@ static struct fasync_struct *fasync;
> static DEFINE_SPINLOCK(random_ready_list_lock);
> static LIST_HEAD(random_ready_list);
>
> +/*
> + * crng_init = 0 --> Uninitialized
> + * 2 --> Initialized
> + * 3 --> Initialized from input_pool
> + */
> +static int crng_init = 0;

shouldn't that be an atomic_t ?

> +#define crng_ready() (likely(crng_init >= 2))
> +static void process_random_ready_list(void);
> +
> /**********************************************************************
> *
> * OS independent entropy store. Here are the functions which handle
> @@ -441,10 +451,13 @@ struct entropy_store {
> __u8 last_data[EXTRACT_SIZE];
> };
>
> +static ssize_t extract_entropy(struct entropy_store *r, void *buf,
> + size_t nbytes, int min, int rsvd);
> +
> +static int crng_reseed(struct entropy_store *r);
> static void push_to_pool(struct work_struct *work);
> static __u32 input_pool_data[INPUT_POOL_WORDS];
> static __u32 blocking_pool_data[OUTPUT_POOL_WORDS];
> -static __u32 nonblocking_pool_data[OUTPUT_POOL_WORDS];
>
> static struct entropy_store input_pool = {
> .poolinfo = &poolinfo_table[0],
> @@ -465,16 +478,6 @@ static struct entropy_store blocking_pool = {
> push_to_pool),
> };
>
> -static struct entropy_store nonblocking_pool = {
> - .poolinfo = &poolinfo_table[1],
> - .name = "nonblocking",
> - .pull = &input_pool,
> - .lock = __SPIN_LOCK_UNLOCKED(nonblocking_pool.lock),
> - .pool = nonblocking_pool_data,
> - .push_work = __WORK_INITIALIZER(nonblocking_pool.push_work,
> - push_to_pool),
> -};
> -
> static __u32 const twist_table[8] = {
> 0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158,
> 0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 };
> @@ -677,12 +680,6 @@ retry:
> if (!r->initialized && r->entropy_total > 128) {
> r->initialized = 1;
> r->entropy_total = 0;
> - if (r == &nonblocking_pool) {
> - prandom_reseed_late();
> - process_random_ready_list();
> - wake_up_all(&urandom_init_wait);
> - pr_notice("random: %s pool is initialized\n", r-
>name);
> - }
> }
>
> trace_credit_entropy_bits(r->name, nbits,
> @@ -692,30 +689,27 @@ retry:
> if (r == &input_pool) {
> int entropy_bits = entropy_count >> ENTROPY_SHIFT;
>
> + if (crng_init < 3 && entropy_bits >= 128) {
> + (void) crng_reseed(r);
> + entropy_bits = r->entropy_count >> ENTROPY_SHIFT;
> + }
> +
> /* should we wake readers? */
> if (entropy_bits >= random_read_wakeup_bits) {
> wake_up_interruptible(&random_read_wait);
> kill_fasync(&fasync, SIGIO, POLL_IN);
> }
> /* If the input pool is getting full, send some
> - * entropy to the two output pools, flipping back and
> - * forth between them, until the output pools are 75%
> - * full.
> + * entropy to the blocking pool until it is 75% full.
> */
> if (entropy_bits > random_write_wakeup_bits &&
> r->initialized &&
> r->entropy_total >= 2*random_read_wakeup_bits) {
> - static struct entropy_store *last = &blocking_pool;
> struct entropy_store *other = &blocking_pool;
>
> - if (last == &blocking_pool)
> - other = &nonblocking_pool;
> if (other->entropy_count <=
> - 3 * other->poolinfo->poolfracbits / 4)
> - last = other;
> - if (last->entropy_count <=
> - 3 * last->poolinfo->poolfracbits / 4) {
> - schedule_work(&last->push_work);
> + 3 * other->poolinfo->poolfracbits / 4) {
> + schedule_work(&other->push_work);
> r->entropy_total = 0;
> }
> }
> @@ -735,6 +729,158 @@ static void credit_entropy_bits_safe(struct
> entropy_store *r, int nbits)
>
> /*********************************************************************
> *
> + * CRNG using CHACHA20
> + *
> + *********************************************************************/
> +
> +#define CRNG_RESEED_INTERVAL (300*HZ)
> +
> +struct crng_state {
> + __u32 state[16];
> + unsigned long init_time;
> + spinlock_t lock;
> +};
> +
> +struct crng_state primary_crng = {
> + .lock = __SPIN_LOCK_UNLOCKED(primary_crng.lock),
> +};
> +static DECLARE_WAIT_QUEUE_HEAD(crng_init_wait);
> +
> +static void _initialize_crng(struct crng_state *crng)
> +{
> + int i;
> + unsigned long rv;

Why do you use unsigned long here? I thought the state[i] is unsigned int.
> +
> + memcpy(&crng->state[0], "expand 32-byte k", 16);
> + for (i = 4; i < 16; i++) {
> + if (!arch_get_random_seed_long(&rv) &&
> + !arch_get_random_long(&rv))
> + rv = random_get_entropy();
> + crng->state[i] ^= rv;
> + }
> + crng->init_time = jiffies - CRNG_RESEED_INTERVAL;

Would it make sense to add the ChaCha20 self test vectors from RFC7539 here to
test that the ChaCha20 works?

> +}
> +
> +static void initialize_crng(struct crng_state *crng)
> +{
> + _initialize_crng(crng);
> + spin_lock_init(&crng->lock);
> +}
> +
> +static int crng_fast_load(__u32 pool[4])
> +{
> + int i;
> + __u32 *p;
> +
> + if (!spin_trylock(&primary_crng.lock))
> + return 0;
> + if (crng_ready()) {
> + spin_unlock(&primary_crng.lock);
> + return 0;
> + }
> + p = &primary_crng.state[4];
> + if (crng_init == 1)
> + p += 4;
> + for (i=0; i < 4; i++)
> + *p ^= pool[i];
> + if (crng_init++ >= 2)
> + wake_up_interruptible(&crng_init_wait);

Don't we have a race here with the crng_init < 3 check in crng_reseed
considering multi-core systems?

> + pr_notice("random: crng_init %d\n", crng_init);
> + spin_unlock(&primary_crng.lock);
> + return 1;
> +}
> +
> +/* Returns 1 on success */
> +static int crng_reseed(struct entropy_store *r)
> +{
> + unsigned long flags;
> + int ret = 0;
> + int i, num, num_words;
> + __u32 tmp[16];
> +
> + spin_lock_irqsave(&primary_crng.lock, flags);
> + num = extract_entropy(r, tmp, 32, 16, 0);
> + if (num == 0)
> + goto out;
> + if (num < 16 || num > 32) {
> + WARN_ON(1);
> + pr_err("crng_reseed: num is %d?!?\n", num);
> + }
> + num_words = (num + 3) / 4;
> + for (i = 0; i < num_words; i++)
> + primary_crng.state[i+4] ^= tmp[i];
> + primary_crng.init_time = jiffies;
> + if (crng_init < 3) {

Shouldn't that one be if (crng_init < 3 && num >= 16) ?

> + crng_init = 3;
> + process_random_ready_list();
> + wake_up_interruptible(&crng_init_wait);
> + pr_notice("random: crng_init 3\n");

Would it make sense to be more descriptive here to allow readers of dmesg to
understand the output?

> + }
> + ret = 1;
> +out:
> + spin_unlock_irqrestore(&primary_crng.lock, flags);

memzero_explicit of tmp?

> + return ret;
> +}
> +
> +static inline void crng_wait_ready(void)
> +{
> + wait_event_interruptible(crng_init_wait, crng_ready());
> +}
> +
> +static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE])
> +{
> + unsigned long v, flags;
> + struct crng_state *crng = &primary_crng;
> +
> + if (crng_init > 2 &&
> + time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL))
> + crng_reseed(&input_pool);
> + spin_lock_irqsave(&crng->lock, flags);
> + if (arch_get_random_long(&v))
> + crng->state[14] ^= v;

Again, unsigned int?

What is the purpose to only cover the 2nd 32 bit value of the nonce with
arch_get_random?

> + chacha20_block(&crng->state[0], out);
> + if (crng->state[12] == 0)
> + crng->state[13]++;

state[12]++? Or why do you increment the nonce?

> + spin_unlock_irqrestore(&crng->lock, flags);
> +}
> +
> +static ssize_t extract_crng_user(void __user *buf, size_t nbytes)
> +{
> + ssize_t ret = 0, i;
> + __u8 tmp[CHACHA20_BLOCK_SIZE];
> + int large_request = (nbytes > 256);
> +
> + while (nbytes) {
> + if (large_request && need_resched()) {
> + if (signal_pending(current)) {
> + if (ret == 0)
> + ret = -ERESTARTSYS;
> + break;
> + }
> + schedule();
> + }
> +
> + extract_crng(tmp);

Now I have to wear my (ugly) FIPS heat: we need that code from the current
implementation here:

if (fips_enabled) {
spin_lock_irqsave(&r->lock, flags);
if (!memcmp(tmp, r->last_data, EXTRACT_SIZE))
panic("Hardware RNG duplicated output!\n");
memcpy(r->last_data, tmp, EXTRACT_SIZE);
spin_unlock_irqrestore(&r->lock, flags);
}

> + i = min_t(int, nbytes, CHACHA20_BLOCK_SIZE);
> + if (copy_to_user(buf, tmp, i)) {
> + ret = -EFAULT;
> + break;
> + }
> +
> + nbytes -= i;
> + buf += i;
> + ret += i;
> + }
> +
> + /* Wipe data just written to memory */
> + memzero_explicit(tmp, sizeof(tmp));
> +
> + return ret;
> +}
> +
> +
> +/*********************************************************************
> + *
> * Entropy input management
> *
> *********************************************************************/
> @@ -749,12 +895,12 @@ struct timer_rand_state {
> #define INIT_TIMER_RAND_STATE { INITIAL_JIFFIES, };
>
> /*
> - * Add device- or boot-specific data to the input and nonblocking
> - * pools to help initialize them to unique values.
> + * Add device- or boot-specific data to the input pool to help
> + * initialize it.
> *
> - * None of this adds any entropy, it is meant to avoid the
> - * problem of the nonblocking pool having similar initial state
> - * across largely identical devices.
> + * None of this adds any entropy; it is meant to avoid the problem of
> + * the entropy pool having similar initial state across largely
> + * identical devices.
> */
> void add_device_randomness(const void *buf, unsigned int size)
> {
> @@ -766,11 +912,6 @@ void add_device_randomness(const void *buf, unsigned
> int size) _mix_pool_bytes(&input_pool, buf, size);
> _mix_pool_bytes(&input_pool, &time, sizeof(time));
> spin_unlock_irqrestore(&input_pool.lock, flags);
> -
> - spin_lock_irqsave(&nonblocking_pool.lock, flags);
> - _mix_pool_bytes(&nonblocking_pool, buf, size);
> - _mix_pool_bytes(&nonblocking_pool, &time, sizeof(time));
> - spin_unlock_irqrestore(&nonblocking_pool.lock, flags);
> }
> EXPORT_SYMBOL(add_device_randomness);
>
> @@ -801,7 +942,7 @@ static void add_timer_randomness(struct timer_rand_state
> *state, unsigned num) sample.jiffies = jiffies;
> sample.cycles = random_get_entropy();
> sample.num = num;
> - r = nonblocking_pool.initialized ? &input_pool : &nonblocking_pool;
> + r = &input_pool;
> mix_pool_bytes(r, &sample, sizeof(sample));
>
> /*
> @@ -921,7 +1062,13 @@ void add_interrupt_randomness(int irq, int irq_flags)
> !time_after(now, fast_pool->last + HZ))
> return;
>
> - r = nonblocking_pool.initialized ? &input_pool : &nonblocking_pool;
> + if (!crng_ready() && crng_fast_load(fast_pool->pool)) {
> + fast_pool->count = 0;
> + fast_pool->last = now;
> + return;
> + }
> +
> + r = &input_pool;
> if (!spin_trylock(&r->lock))
> return;
>
> @@ -964,9 +1111,6 @@ EXPORT_SYMBOL_GPL(add_disk_randomness);
> *
> *********************************************************************/
>
> -static ssize_t extract_entropy(struct entropy_store *r, void *buf,
> - size_t nbytes, int min, int rsvd);
> -
> /*
> * This utility inline function is responsible for transferring entropy
> * from the primary pool to the secondary extraction pool. We make
> @@ -1252,15 +1396,26 @@ static ssize_t extract_entropy_user(struct
> entropy_store *r, void __user *buf, */
> void get_random_bytes(void *buf, int nbytes)
> {
> + __u8 tmp[CHACHA20_BLOCK_SIZE];
> +
> #if DEBUG_RANDOM_BOOT > 0
> - if (unlikely(nonblocking_pool.initialized == 0))
> + if (!crng_ready())
> printk(KERN_NOTICE "random: %pF get_random_bytes called "
> - "with %d bits of entropy available\n",
> - (void *) _RET_IP_,
> - nonblocking_pool.entropy_total);
> + "with crng_init = %d\n", (void *) _RET_IP_, crng_init);
> #endif
> trace_get_random_bytes(nbytes, _RET_IP_);
> - extract_entropy(&nonblocking_pool, buf, nbytes, 0, 0);
> +
> + while (nbytes >= CHACHA20_BLOCK_SIZE) {
> + extract_crng(buf);
> + buf += CHACHA20_BLOCK_SIZE;
> + nbytes -= CHACHA20_BLOCK_SIZE;
> + }
> +
> + if (nbytes > 0) {
> + extract_crng(tmp);
> + memcpy(buf, tmp, nbytes);
> + memzero_explicit(tmp, nbytes);
> + }
> }
> EXPORT_SYMBOL(get_random_bytes);
>
> @@ -1278,7 +1433,7 @@ int add_random_ready_callback(struct
> random_ready_callback *rdy) unsigned long flags;
> int err = -EALREADY;
>
> - if (likely(nonblocking_pool.initialized))
> + if (crng_ready())
> return err;
>
> owner = rdy->owner;
> @@ -1286,7 +1441,7 @@ int add_random_ready_callback(struct
> random_ready_callback *rdy) return -ENOENT;
>
> spin_lock_irqsave(&random_ready_list_lock, flags);
> - if (nonblocking_pool.initialized)
> + if (crng_ready())
> goto out;
>
> owner = NULL;
> @@ -1350,7 +1505,7 @@ void get_random_bytes_arch(void *buf, int nbytes)
> }
>
> if (nbytes)
> - extract_entropy(&nonblocking_pool, p, nbytes, 0, 0);
> + get_random_bytes(p, nbytes);
> }
> EXPORT_SYMBOL(get_random_bytes_arch);
>
> @@ -1395,7 +1550,7 @@ static int rand_initialize(void)
> {
> init_std_data(&input_pool);
> init_std_data(&blocking_pool);
> - init_std_data(&nonblocking_pool);
> + _initialize_crng(&primary_crng);
> return 0;
> }
> early_initcall(rand_initialize);
> @@ -1459,16 +1614,10 @@ urandom_read(struct file *file, char __user *buf,
> size_t nbytes, loff_t *ppos) {
> int ret;
>
> - if (unlikely(nonblocking_pool.initialized == 0))
> - printk_once(KERN_NOTICE "random: %s urandom read "
> - "with %d bits of entropy available\n",
> - current->comm, nonblocking_pool.entropy_total);
> -
> + crng_wait_ready();

Just for clarification: are you now blocking /dev/urandom until the CRNG is
filled? That would be a big win.

> nbytes = min_t(size_t, nbytes, INT_MAX >> (ENTROPY_SHIFT + 3));
> - ret = extract_entropy_user(&nonblocking_pool, buf, nbytes);
> -
> - trace_urandom_read(8 * nbytes, ENTROPY_BITS(&nonblocking_pool),
> - ENTROPY_BITS(&input_pool));
> + ret = extract_crng_user(buf, nbytes);
> + trace_urandom_read(8 * nbytes, 0, ENTROPY_BITS(&input_pool));
> return ret;
> }
>
> @@ -1514,10 +1663,7 @@ static ssize_t random_write(struct file *file, const
> char __user *buffer, {
> size_t ret;
>
> - ret = write_pool(&blocking_pool, buffer, count);
> - if (ret)
> - return ret;
> - ret = write_pool(&nonblocking_pool, buffer, count);
> + ret = write_pool(&input_pool, buffer, count);
> if (ret)
> return ret;
>
> @@ -1568,7 +1714,6 @@ static long random_ioctl(struct file *f, unsigned int
> cmd, unsigned long arg) if (!capable(CAP_SYS_ADMIN))
> return -EPERM;
> input_pool.entropy_count = 0;
> - nonblocking_pool.entropy_count = 0;
> blocking_pool.entropy_count = 0;
> return 0;
> default:
> @@ -1610,11 +1755,10 @@ SYSCALL_DEFINE3(getrandom, char __user *, buf,
> size_t, count, if (flags & GRND_RANDOM)
> return _random_read(flags & GRND_NONBLOCK, buf, count);
>
> - if (unlikely(nonblocking_pool.initialized == 0)) {
> + if (!crng_ready()) {
> if (flags & GRND_NONBLOCK)
> return -EAGAIN;
> - wait_event_interruptible(urandom_init_wait,
> - nonblocking_pool.initialized);
> + crng_wait_ready();
> if (signal_pending(current))
> return -ERESTARTSYS;
> }
> diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h
> index 274bbae..20d20f68 100644
> --- a/include/crypto/chacha20.h
> +++ b/include/crypto/chacha20.h
> @@ -16,6 +16,7 @@ struct chacha20_ctx {
> u32 key[8];
> };
>
> +void chacha20_block(u32 *state, void *stream);
> void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv);
> int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key,
> unsigned int keysize);
> diff --git a/lib/Makefile b/lib/Makefile
> index 7bd6fd4..9ba27cd 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -22,7 +22,7 @@ KCOV_INSTRUMENT_hweight.o := n
> lib-y := ctype.o string.o vsprintf.o cmdline.o \
> rbtree.o radix-tree.o dump_stack.o timerqueue.o\
> idr.o int_sqrt.o extable.o \
> - sha1.o md5.o irq_regs.o argv_split.o \
> + sha1.o chacha20.o md5.o irq_regs.o argv_split.o \
> proportions.o flex_proportions.o ratelimit.o show_mem.o \
> is_single_threaded.o plist.o decompress.o kobject_uevent.o \
> earlycpio.o seq_buf.o nmi_backtrace.o
> diff --git a/lib/chacha20.c b/lib/chacha20.c
> new file mode 100644
> index 0000000..250ceed
> --- /dev/null
> +++ b/lib/chacha20.c
> @@ -0,0 +1,79 @@
> +/*
> + * ChaCha20 256-bit cipher algorithm, RFC7539
> + *
> + * Copyright (C) 2015 Martin Willi
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/export.h>
> +#include <linux/bitops.h>
> +#include <linux/cryptohash.h>
> +#include <asm/unaligned.h>
> +#include <crypto/chacha20.h>
> +
> +static inline u32 rotl32(u32 v, u8 n)
> +{
> + return (v << n) | (v >> (sizeof(v) * 8 - n));
> +}
> +
> +extern void chacha20_block(u32 *state, void *stream)
> +{
> + u32 x[16], *out = stream;
> + int i;
> +
> + for (i = 0; i < ARRAY_SIZE(x); i++)
> + x[i] = state[i];
> +
> + for (i = 0; i < 20; i += 2) {
> + x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 16);
> + x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 16);
> + x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 16);
> + x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 16);
> +
> + x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 12);
> + x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 12);
> + x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 12);
> + x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 12);
> +
> + x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 8);
> + x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 8);
> + x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 8);
> + x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 8);
> +
> + x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 7);
> + x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 7);
> + x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 7);
> + x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 7);
> +
> + x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 16);
> + x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 16);
> + x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 16);
> + x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 16);
> +
> + x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 12);
> + x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 12);
> + x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 12);
> + x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 12);
> +
> + x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 8);
> + x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 8);
> + x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 8);
> + x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 8);
> +
> + x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 7);
> + x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 7);
> + x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 7);
> + x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 7);
> + }
> +
> + for (i = 0; i < ARRAY_SIZE(x); i++)
> + out[i] = cpu_to_le32(x[i] + state[i]);
> +
> + state[12]++;
> +}
> +EXPORT_SYMBOL(chacha20_block);

Ciao
Stephan

2016-05-03 09:36:17

[permalink] [raw]

Subject: Re: [PATCH 1/3] random: replace non-blocking pool with a Chacha20-based CRNG

Am Montag, 2. Mai 2016, 02:26:51 schrieb Theodore Ts'o:

Hi Theodore,

One more item.

> The CRNG is faster, and we don't pretend to track entropy usage in the
> CRNG any more.
>
> Signed-off-by: Theodore Ts'o <[email protected]>
> ---
> crypto/chacha20_generic.c | 61 ----------
> drivers/char/random.c | 282
> ++++++++++++++++++++++++++++++++++------------ include/crypto/chacha20.h |
> 1 +
> lib/Makefile | 2 +-
> lib/chacha20.c | 79 +++++++++++++
> 5 files changed, 294 insertions(+), 131 deletions(-)
> create mode 100644 lib/chacha20.c
>
> diff --git a/crypto/chacha20_generic.c b/crypto/chacha20_generic.c
> index da9c899..1cab831 100644
> --- a/crypto/chacha20_generic.c
> +++ b/crypto/chacha20_generic.c
> @@ -15,72 +15,11 @@
> #include <linux/module.h>
> #include <crypto/chacha20.h>
>
> -static inline u32 rotl32(u32 v, u8 n)
> -{
> - return (v << n) | (v >> (sizeof(v) * 8 - n));
> -}
> -
> static inline u32 le32_to_cpuvp(const void *p)
> {
> return le32_to_cpup(p);
> }
>
> -static void chacha20_block(u32 *state, void *stream)
> -{
> - u32 x[16], *out = stream;
> - int i;
> -
> - for (i = 0; i < ARRAY_SIZE(x); i++)
> - x[i] = state[i];
> -
> - for (i = 0; i < 20; i += 2) {
> - x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 16);
> - x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 16);
> - x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 16);
> - x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 16);
> -
> - x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 12);
> - x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 12);
> - x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 12);
> - x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 12);
> -
> - x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 8);
> - x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 8);
> - x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 8);
> - x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 8);
> -
> - x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 7);
> - x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 7);
> - x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 7);
> - x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 7);
> -
> - x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 16);
> - x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 16);
> - x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 16);
> - x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 16);
> -
> - x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 12);
> - x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 12);
> - x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 12);
> - x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 12);
> -
> - x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 8);
> - x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 8);
> - x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 8);
> - x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 8);
> -
> - x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 7);
> - x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 7);
> - x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 7);
> - x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 7);
> - }
> -
> - for (i = 0; i < ARRAY_SIZE(x); i++)
> - out[i] = cpu_to_le32(x[i] + state[i]);
> -
> - state[12]++;
> -}
> -
> static void chacha20_docrypt(u32 *state, u8 *dst, const u8 *src,
> unsigned int bytes)
> {
> diff --git a/drivers/char/random.c b/drivers/char/random.c
> index b583e53..95f4451 100644
> --- a/drivers/char/random.c
> +++ b/drivers/char/random.c
> @@ -260,6 +260,7 @@
> #include <linux/irq.h>
> #include <linux/syscalls.h>
> #include <linux/completion.h>
> +#include <crypto/chacha20.h>
>
> #include <asm/processor.h>
> #include <asm/uaccess.h>
> @@ -412,6 +413,15 @@ static struct fasync_struct *fasync;
> static DEFINE_SPINLOCK(random_ready_list_lock);
> static LIST_HEAD(random_ready_list);
>
> +/*
> + * crng_init = 0 --> Uninitialized
> + * 2 --> Initialized
> + * 3 --> Initialized from input_pool
> + */
> +static int crng_init = 0;
> +#define crng_ready() (likely(crng_init >= 2))
> +static void process_random_ready_list(void);
> +
> /**********************************************************************
> *
> * OS independent entropy store. Here are the functions which handle
> @@ -441,10 +451,13 @@ struct entropy_store {
> __u8 last_data[EXTRACT_SIZE];
> };
>
> +static ssize_t extract_entropy(struct entropy_store *r, void *buf,
> + size_t nbytes, int min, int rsvd);
> +
> +static int crng_reseed(struct entropy_store *r);
> static void push_to_pool(struct work_struct *work);
> static __u32 input_pool_data[INPUT_POOL_WORDS];
> static __u32 blocking_pool_data[OUTPUT_POOL_WORDS];
> -static __u32 nonblocking_pool_data[OUTPUT_POOL_WORDS];
>
> static struct entropy_store input_pool = {
> .poolinfo = &poolinfo_table[0],
> @@ -465,16 +478,6 @@ static struct entropy_store blocking_pool = {
> push_to_pool),
> };
>
> -static struct entropy_store nonblocking_pool = {
> - .poolinfo = &poolinfo_table[1],
> - .name = "nonblocking",
> - .pull = &input_pool,
> - .lock = __SPIN_LOCK_UNLOCKED(nonblocking_pool.lock),
> - .pool = nonblocking_pool_data,
> - .push_work = __WORK_INITIALIZER(nonblocking_pool.push_work,
> - push_to_pool),
> -};
> -
> static __u32 const twist_table[8] = {
> 0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158,
> 0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 };
> @@ -677,12 +680,6 @@ retry:
> if (!r->initialized && r->entropy_total > 128) {
> r->initialized = 1;
> r->entropy_total = 0;
> - if (r == &nonblocking_pool) {
> - prandom_reseed_late();
> - process_random_ready_list();
> - wake_up_all(&urandom_init_wait);
> - pr_notice("random: %s pool is initialized\n", r-
>name);
> - }
> }
>
> trace_credit_entropy_bits(r->name, nbits,
> @@ -692,30 +689,27 @@ retry:
> if (r == &input_pool) {
> int entropy_bits = entropy_count >> ENTROPY_SHIFT;
>
> + if (crng_init < 3 && entropy_bits >= 128) {
> + (void) crng_reseed(r);
> + entropy_bits = r->entropy_count >> ENTROPY_SHIFT;
> + }
> +
> /* should we wake readers? */
> if (entropy_bits >= random_read_wakeup_bits) {
> wake_up_interruptible(&random_read_wait);
> kill_fasync(&fasync, SIGIO, POLL_IN);
> }
> /* If the input pool is getting full, send some
> - * entropy to the two output pools, flipping back and
> - * forth between them, until the output pools are 75%
> - * full.
> + * entropy to the blocking pool until it is 75% full.
> */
> if (entropy_bits > random_write_wakeup_bits &&
> r->initialized &&
> r->entropy_total >= 2*random_read_wakeup_bits) {
> - static struct entropy_store *last = &blocking_pool;
> struct entropy_store *other = &blocking_pool;
>
> - if (last == &blocking_pool)
> - other = &nonblocking_pool;
> if (other->entropy_count <=
> - 3 * other->poolinfo->poolfracbits / 4)
> - last = other;
> - if (last->entropy_count <=
> - 3 * last->poolinfo->poolfracbits / 4) {
> - schedule_work(&last->push_work);
> + 3 * other->poolinfo->poolfracbits / 4) {
> + schedule_work(&other->push_work);
> r->entropy_total = 0;
> }
> }
> @@ -735,6 +729,158 @@ static void credit_entropy_bits_safe(struct
> entropy_store *r, int nbits)
>
> /*********************************************************************
> *
> + * CRNG using CHACHA20
> + *
> + *********************************************************************/
> +
> +#define CRNG_RESEED_INTERVAL (300*HZ)
> +
> +struct crng_state {
> + __u32 state[16];
> + unsigned long init_time;
> + spinlock_t lock;
> +};
> +
> +struct crng_state primary_crng = {
> + .lock = __SPIN_LOCK_UNLOCKED(primary_crng.lock),
> +};
> +static DECLARE_WAIT_QUEUE_HEAD(crng_init_wait);
> +
> +static void _initialize_crng(struct crng_state *crng)
> +{
> + int i;
> + unsigned long rv;
> +
> + memcpy(&crng->state[0], "expand 32-byte k", 16);
> + for (i = 4; i < 16; i++) {
> + if (!arch_get_random_seed_long(&rv) &&
> + !arch_get_random_long(&rv))
> + rv = random_get_entropy();
> + crng->state[i] ^= rv;
> + }
> + crng->init_time = jiffies - CRNG_RESEED_INTERVAL;
> +}
> +
> +static void initialize_crng(struct crng_state *crng)
> +{
> + _initialize_crng(crng);
> + spin_lock_init(&crng->lock);
> +}
> +
> +static int crng_fast_load(__u32 pool[4])
> +{
> + int i;
> + __u32 *p;
> +
> + if (!spin_trylock(&primary_crng.lock))
> + return 0;
> + if (crng_ready()) {
> + spin_unlock(&primary_crng.lock);
> + return 0;
> + }
> + p = &primary_crng.state[4];
> + if (crng_init == 1)
> + p += 4;
> + for (i=0; i < 4; i++)
> + *p ^= pool[i];
> + if (crng_init++ >= 2)
> + wake_up_interruptible(&crng_init_wait);
> + pr_notice("random: crng_init %d\n", crng_init);
> + spin_unlock(&primary_crng.lock);
> + return 1;
> +}
> +
> +/* Returns 1 on success */
> +static int crng_reseed(struct entropy_store *r)
> +{
> + unsigned long flags;
> + int ret = 0;
> + int i, num, num_words;
> + __u32 tmp[16];
> +
> + spin_lock_irqsave(&primary_crng.lock, flags);
> + num = extract_entropy(r, tmp, 32, 16, 0);
> + if (num == 0)
> + goto out;
> + if (num < 16 || num > 32) {
> + WARN_ON(1);
> + pr_err("crng_reseed: num is %d?!?\n", num);
> + }
> + num_words = (num + 3) / 4;
> + for (i = 0; i < num_words; i++)
> + primary_crng.state[i+4] ^= tmp[i];
> + primary_crng.init_time = jiffies;
> + if (crng_init < 3) {
> + crng_init = 3;
> + process_random_ready_list();
> + wake_up_interruptible(&crng_init_wait);
> + pr_notice("random: crng_init 3\n");
> + }
> + ret = 1;
> +out:
> + spin_unlock_irqrestore(&primary_crng.lock, flags);
> + return ret;
> +}
> +
> +static inline void crng_wait_ready(void)
> +{
> + wait_event_interruptible(crng_init_wait, crng_ready());
> +}
> +
> +static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE])
> +{
> + unsigned long v, flags;
> + struct crng_state *crng = &primary_crng;
> +
> + if (crng_init > 2 &&
> + time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL))
> + crng_reseed(&input_pool);
> + spin_lock_irqsave(&crng->lock, flags);
> + if (arch_get_random_long(&v))
> + crng->state[14] ^= v;
> + chacha20_block(&crng->state[0], out);
> + if (crng->state[12] == 0)
> + crng->state[13]++;
> + spin_unlock_irqrestore(&crng->lock, flags);
> +}
> +
> +static ssize_t extract_crng_user(void __user *buf, size_t nbytes)
> +{
> + ssize_t ret = 0, i;
> + __u8 tmp[CHACHA20_BLOCK_SIZE];
> + int large_request = (nbytes > 256);
> +
> + while (nbytes) {
> + if (large_request && need_resched()) {
> + if (signal_pending(current)) {
> + if (ret == 0)
> + ret = -ERESTARTSYS;
> + break;
> + }
> + schedule();
> + }
> +
> + extract_crng(tmp);
> + i = min_t(int, nbytes, CHACHA20_BLOCK_SIZE);
> + if (copy_to_user(buf, tmp, i)) {
> + ret = -EFAULT;
> + break;
> + }
> +
> + nbytes -= i;
> + buf += i;
> + ret += i;
> + }
> +
> + /* Wipe data just written to memory */
> + memzero_explicit(tmp, sizeof(tmp));

Would it make sense to add another chacha20_block() call here at the end?
Note, the one thing about the SP800-90A DRBG I really like is the enhanced
backward secrecy support which is implemented by "updating" the internal state
(the key / state) used for one or more random number generation rounds after
one request for random numbers is satisfied.

This means that even if the state becomes known or the subsequent caller
manages to deduce the state of the RNG to some degree of confidence, he cannot
backtrack the already generated random numbers.

I see that the ChaCha20 RNG implicitly updates its state while it operates.
But for the last round of the RNG, there is no more shuffling of the internal
state. As one round is 64 bytes in size and many callers just want 16 or 32
bytes (as seen during testing), a lot of callers trigger only one round of the
RNG.

> +
> + return ret;
> +}
> +
> +
> +/*********************************************************************
> + *
> * Entropy input management
> *
> *********************************************************************/
> @@ -749,12 +895,12 @@ struct timer_rand_state {
> #define INIT_TIMER_RAND_STATE { INITIAL_JIFFIES, };
>
> /*
> - * Add device- or boot-specific data to the input and nonblocking
> - * pools to help initialize them to unique values.
> + * Add device- or boot-specific data to the input pool to help
> + * initialize it.
> *
> - * None of this adds any entropy, it is meant to avoid the
> - * problem of the nonblocking pool having similar initial state
> - * across largely identical devices.
> + * None of this adds any entropy; it is meant to avoid the problem of
> + * the entropy pool having similar initial state across largely
> + * identical devices.
> */
> void add_device_randomness(const void *buf, unsigned int size)
> {
> @@ -766,11 +912,6 @@ void add_device_randomness(const void *buf, unsigned
> int size) _mix_pool_bytes(&input_pool, buf, size);
> _mix_pool_bytes(&input_pool, &time, sizeof(time));
> spin_unlock_irqrestore(&input_pool.lock, flags);
> -
> - spin_lock_irqsave(&nonblocking_pool.lock, flags);
> - _mix_pool_bytes(&nonblocking_pool, buf, size);
> - _mix_pool_bytes(&nonblocking_pool, &time, sizeof(time));
> - spin_unlock_irqrestore(&nonblocking_pool.lock, flags);
> }
> EXPORT_SYMBOL(add_device_randomness);
>
> @@ -801,7 +942,7 @@ static void add_timer_randomness(struct timer_rand_state
> *state, unsigned num) sample.jiffies = jiffies;
> sample.cycles = random_get_entropy();
> sample.num = num;
> - r = nonblocking_pool.initialized ? &input_pool : &nonblocking_pool;
> + r = &input_pool;
> mix_pool_bytes(r, &sample, sizeof(sample));
>
> /*
> @@ -921,7 +1062,13 @@ void add_interrupt_randomness(int irq, int irq_flags)
> !time_after(now, fast_pool->last + HZ))
> return;
>
> - r = nonblocking_pool.initialized ? &input_pool : &nonblocking_pool;
> + if (!crng_ready() && crng_fast_load(fast_pool->pool)) {
> + fast_pool->count = 0;
> + fast_pool->last = now;
> + return;
> + }
> +
> + r = &input_pool;
> if (!spin_trylock(&r->lock))
> return;
>
> @@ -964,9 +1111,6 @@ EXPORT_SYMBOL_GPL(add_disk_randomness);
> *
> *********************************************************************/
>
> -static ssize_t extract_entropy(struct entropy_store *r, void *buf,
> - size_t nbytes, int min, int rsvd);
> -
> /*
> * This utility inline function is responsible for transferring entropy
> * from the primary pool to the secondary extraction pool. We make
> @@ -1252,15 +1396,26 @@ static ssize_t extract_entropy_user(struct
> entropy_store *r, void __user *buf, */
> void get_random_bytes(void *buf, int nbytes)
> {
> + __u8 tmp[CHACHA20_BLOCK_SIZE];
> +
> #if DEBUG_RANDOM_BOOT > 0
> - if (unlikely(nonblocking_pool.initialized == 0))
> + if (!crng_ready())
> printk(KERN_NOTICE "random: %pF get_random_bytes called "
> - "with %d bits of entropy available\n",
> - (void *) _RET_IP_,
> - nonblocking_pool.entropy_total);
> + "with crng_init = %d\n", (void *) _RET_IP_, crng_init);
> #endif
> trace_get_random_bytes(nbytes, _RET_IP_);
> - extract_entropy(&nonblocking_pool, buf, nbytes, 0, 0);
> +
> + while (nbytes >= CHACHA20_BLOCK_SIZE) {
> + extract_crng(buf);
> + buf += CHACHA20_BLOCK_SIZE;
> + nbytes -= CHACHA20_BLOCK_SIZE;
> + }
> +
> + if (nbytes > 0) {
> + extract_crng(tmp);
> + memcpy(buf, tmp, nbytes);
> + memzero_explicit(tmp, nbytes);
> + }

dto here.

> }
> EXPORT_SYMBOL(get_random_bytes);
>
> @@ -1278,7 +1433,7 @@ int add_random_ready_callback(struct
> random_ready_callback *rdy) unsigned long flags;
> int err = -EALREADY;
>
> - if (likely(nonblocking_pool.initialized))
> + if (crng_ready())
> return err;
>
> owner = rdy->owner;
> @@ -1286,7 +1441,7 @@ int add_random_ready_callback(struct
> random_ready_callback *rdy) return -ENOENT;
>
> spin_lock_irqsave(&random_ready_list_lock, flags);
> - if (nonblocking_pool.initialized)
> + if (crng_ready())
> goto out;
>
> owner = NULL;
> @@ -1350,7 +1505,7 @@ void get_random_bytes_arch(void *buf, int nbytes)
> }
>
> if (nbytes)
> - extract_entropy(&nonblocking_pool, p, nbytes, 0, 0);
> + get_random_bytes(p, nbytes);
> }
> EXPORT_SYMBOL(get_random_bytes_arch);
>
> @@ -1395,7 +1550,7 @@ static int rand_initialize(void)
> {
> init_std_data(&input_pool);
> init_std_data(&blocking_pool);
> - init_std_data(&nonblocking_pool);
> + _initialize_crng(&primary_crng);
> return 0;
> }
> early_initcall(rand_initialize);
> @@ -1459,16 +1614,10 @@ urandom_read(struct file *file, char __user *buf,
> size_t nbytes, loff_t *ppos) {
> int ret;
>
> - if (unlikely(nonblocking_pool.initialized == 0))
> - printk_once(KERN_NOTICE "random: %s urandom read "
> - "with %d bits of entropy available\n",
> - current->comm, nonblocking_pool.entropy_total);
> -
> + crng_wait_ready();
> nbytes = min_t(size_t, nbytes, INT_MAX >> (ENTROPY_SHIFT + 3));
> - ret = extract_entropy_user(&nonblocking_pool, buf, nbytes);
> -
> - trace_urandom_read(8 * nbytes, ENTROPY_BITS(&nonblocking_pool),
> - ENTROPY_BITS(&input_pool));
> + ret = extract_crng_user(buf, nbytes);
> + trace_urandom_read(8 * nbytes, 0, ENTROPY_BITS(&input_pool));
> return ret;
> }
>
> @@ -1514,10 +1663,7 @@ static ssize_t random_write(struct file *file, const
> char __user *buffer, {
> size_t ret;
>
> - ret = write_pool(&blocking_pool, buffer, count);
> - if (ret)
> - return ret;
> - ret = write_pool(&nonblocking_pool, buffer, count);
> + ret = write_pool(&input_pool, buffer, count);
> if (ret)
> return ret;
>
> @@ -1568,7 +1714,6 @@ static long random_ioctl(struct file *f, unsigned int
> cmd, unsigned long arg) if (!capable(CAP_SYS_ADMIN))
> return -EPERM;
> input_pool.entropy_count = 0;
> - nonblocking_pool.entropy_count = 0;
> blocking_pool.entropy_count = 0;
> return 0;
> default:
> @@ -1610,11 +1755,10 @@ SYSCALL_DEFINE3(getrandom, char __user *, buf,
> size_t, count, if (flags & GRND_RANDOM)
> return _random_read(flags & GRND_NONBLOCK, buf, count);
>
> - if (unlikely(nonblocking_pool.initialized == 0)) {
> + if (!crng_ready()) {
> if (flags & GRND_NONBLOCK)
> return -EAGAIN;
> - wait_event_interruptible(urandom_init_wait,
> - nonblocking_pool.initialized);
> + crng_wait_ready();
> if (signal_pending(current))
> return -ERESTARTSYS;
> }
> diff --git a/include/crypto/chacha20.h b/include/crypto/chacha20.h
> index 274bbae..20d20f68 100644
> --- a/include/crypto/chacha20.h
> +++ b/include/crypto/chacha20.h
> @@ -16,6 +16,7 @@ struct chacha20_ctx {
> u32 key[8];
> };
>
> +void chacha20_block(u32 *state, void *stream);
> void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv);
> int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key,
> unsigned int keysize);
> diff --git a/lib/Makefile b/lib/Makefile
> index 7bd6fd4..9ba27cd 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -22,7 +22,7 @@ KCOV_INSTRUMENT_hweight.o := n
> lib-y := ctype.o string.o vsprintf.o cmdline.o \
> rbtree.o radix-tree.o dump_stack.o timerqueue.o\
> idr.o int_sqrt.o extable.o \
> - sha1.o md5.o irq_regs.o argv_split.o \
> + sha1.o chacha20.o md5.o irq_regs.o argv_split.o \
> proportions.o flex_proportions.o ratelimit.o show_mem.o \
> is_single_threaded.o plist.o decompress.o kobject_uevent.o \
> earlycpio.o seq_buf.o nmi_backtrace.o
> diff --git a/lib/chacha20.c b/lib/chacha20.c
> new file mode 100644
> index 0000000..250ceed
> --- /dev/null
> +++ b/lib/chacha20.c
> @@ -0,0 +1,79 @@
> +/*
> + * ChaCha20 256-bit cipher algorithm, RFC7539
> + *
> + * Copyright (C) 2015 Martin Willi
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/export.h>
> +#include <linux/bitops.h>
> +#include <linux/cryptohash.h>
> +#include <asm/unaligned.h>
> +#include <crypto/chacha20.h>
> +
> +static inline u32 rotl32(u32 v, u8 n)
> +{
> + return (v << n) | (v >> (sizeof(v) * 8 - n));
> +}
> +
> +extern void chacha20_block(u32 *state, void *stream)
> +{
> + u32 x[16], *out = stream;
> + int i;
> +
> + for (i = 0; i < ARRAY_SIZE(x); i++)
> + x[i] = state[i];
> +
> + for (i = 0; i < 20; i += 2) {
> + x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 16);
> + x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 16);
> + x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 16);
> + x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 16);
> +
> + x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 12);
> + x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 12);
> + x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 12);
> + x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 12);
> +
> + x[0] += x[4]; x[12] = rotl32(x[12] ^ x[0], 8);
> + x[1] += x[5]; x[13] = rotl32(x[13] ^ x[1], 8);
> + x[2] += x[6]; x[14] = rotl32(x[14] ^ x[2], 8);
> + x[3] += x[7]; x[15] = rotl32(x[15] ^ x[3], 8);
> +
> + x[8] += x[12]; x[4] = rotl32(x[4] ^ x[8], 7);
> + x[9] += x[13]; x[5] = rotl32(x[5] ^ x[9], 7);
> + x[10] += x[14]; x[6] = rotl32(x[6] ^ x[10], 7);
> + x[11] += x[15]; x[7] = rotl32(x[7] ^ x[11], 7);
> +
> + x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 16);
> + x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 16);
> + x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 16);
> + x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 16);
> +
> + x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 12);
> + x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 12);
> + x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 12);
> + x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 12);
> +
> + x[0] += x[5]; x[15] = rotl32(x[15] ^ x[0], 8);
> + x[1] += x[6]; x[12] = rotl32(x[12] ^ x[1], 8);
> + x[2] += x[7]; x[13] = rotl32(x[13] ^ x[2], 8);
> + x[3] += x[4]; x[14] = rotl32(x[14] ^ x[3], 8);
> +
> + x[10] += x[15]; x[5] = rotl32(x[5] ^ x[10], 7);
> + x[11] += x[12]; x[6] = rotl32(x[6] ^ x[11], 7);
> + x[8] += x[13]; x[7] = rotl32(x[7] ^ x[8], 7);
> + x[9] += x[14]; x[4] = rotl32(x[4] ^ x[9], 7);
> + }
> +
> + for (i = 0; i < ARRAY_SIZE(x); i++)
> + out[i] = cpu_to_le32(x[i] + state[i]);
> +
> + state[12]++;
> +}
> +EXPORT_SYMBOL(chacha20_block);

Ciao
Stephan

2016-05-04 06:24:31

[permalink] [raw]

Subject: Re: [PATCH 1/3] random: replace non-blocking pool with a Chacha20-based CRNG

Am Dienstag, 3. Mai 2016, 11:36:12 schrieb Stephan Mueller:

Hi Ted,

> > +
> > +static ssize_t extract_crng_user(void __user *buf, size_t nbytes)
> > +{
> > + ssize_t ret = 0, i;
> > + __u8 tmp[CHACHA20_BLOCK_SIZE];
> > + int large_request = (nbytes > 256);
> > +
> > + while (nbytes) {
> > + if (large_request && need_resched()) {
> > + if (signal_pending(current)) {
> > + if (ret == 0)
> > + ret = -ERESTARTSYS;
> > + break;
> > + }
> > + schedule();
> > + }
> > +
> > + extract_crng(tmp);
> > + i = min_t(int, nbytes, CHACHA20_BLOCK_SIZE);
> > + if (copy_to_user(buf, tmp, i)) {
> > + ret = -EFAULT;
> > + break;
> > + }
> > +
> > + nbytes -= i;
> > + buf += i;
> > + ret += i;
> > + }
> > +
> > + /* Wipe data just written to memory */
> > + memzero_explicit(tmp, sizeof(tmp));
>
> Would it make sense to add another chacha20_block() call here at the end?
> Note, the one thing about the SP800-90A DRBG I really like is the enhanced
> backward secrecy support which is implemented by "updating" the internal
> state (the key / state) used for one or more random number generation
> rounds after one request for random numbers is satisfied.
>
> This means that even if the state becomes known or the subsequent caller
> manages to deduce the state of the RNG to some degree of confidence, he
> cannot backtrack the already generated random numbers.
>
> I see that the ChaCha20 RNG implicitly updates its state while it operates.
> But for the last round of the RNG, there is no more shuffling of the
> internal state. As one round is 64 bytes in size and many callers just want
> 16 or 32 bytes (as seen during testing), a lot of callers trigger only one
> round of the RNG.

After doing some performance tests, I see that we reach a performance of north
of 200 MB/s on my system (compare that to 12 MB/s for the SHA-1 version).

Thus, I would assume adding another call to chacha20_block should not hurt.

Ciao
Stephan

2016-05-04 14:40:21

[permalink] [raw]

Subject: Re: [PATCH 1/3] random: replace non-blocking pool with a Chacha20-based CRNG

> +static inline u32 rotl32(u32 v, u8 n)
> +{
> + return (v << n) | (v >> (sizeof(v) * 8 - n));
> +}

That's undefined behavior when n=0.

I think the portable way to do a rotate that avoids UB is the
following. GCC, Clang and ICC recognize the pattern, and emit a rotate
instruction.

static const unsigned int MASK=31;
return (v<<n)|(v>>(-n&MASK));

You should also avoid the following because its not constant time due
to the branch:

return n == 0 ? v : (v << n) | (v >> (sizeof(v) * 8 - n));

Jeff

2016-05-04 16:54:14

[permalink] [raw]

Subject: Re: [PATCH 1/3] random: replace non-blocking pool with a Chacha20-based CRNG

>> + chacha20_block(&crng->state[0], out);
>> + if (crng->state[12] == 0)
>> + crng->state[13]++;
>
> state[12]++? Or why do you increment the nonce?

In Bernstein's Salsa and ChaCha, the counter is 64-bit. It appears
ChaCha-TLS uses a 32-bit counter, and the other 32-bits is given to
the nonce.

Maybe the first question to ask is, what ChaCha is the kernel
providing? If its ChaCha-TLS, then the carry does not make a lot of
sense.

If the generator is limiting the amount of material under a given set
of security parameters (key and nonce), then the generator will likely
re-key itself long before the 256-GB induced wrap. In this case, it
does not matter which ChaCha the kernel is providing and the carry is
superfluous.

Jeff

2016-05-04 17:30:41

[permalink] [raw]

Subject: Re: [PATCH 1/3] random: replace non-blocking pool with a Chacha20-based CRNG

On Tue, May 03, 2016 at 10:50:25AM +0200, Stephan Mueller wrote:
> > +/*
> > + * crng_init = 0 --> Uninitialized
> > + * 2 --> Initialized
> > + * 3 --> Initialized from input_pool
> > + */
> > +static int crng_init = 0;
>
> shouldn't that be an atomic_t ?

The crng_init variable only gets incremented (it progresses from
0->1->2->3) and it's protected by the primary_crng->lock spinlock.
There are a few places where we read it without first taking the lock,
but that's where we depend on it getting incremented and where if we
race with another CPU which has just bumped the crng_init status, it's
not critical. (Or we can take the lock and then recheck the crng_init
if we really need to be sure.)

> > +static void _initialize_crng(struct crng_state *crng)
> > +{
> > + int i;
> > + unsigned long rv;
>
> Why do you use unsigned long here? I thought the state[i] is unsigned int.

Because it gets filled in via arch_get_random_seed_long(&rv) and
arch_get_random_log(&rv). If that means we get 64 bits and we only
use 32 bits, that's OK....

> Would it make sense to add the ChaCha20 self test vectors from RFC7539 here to
> test that the ChaCha20 works?

We're not doing that for any of the other ciphers and hashes. We
didn't do that for SHA1, and the chacha20 code where I took this from
didn't check for this as well. What threat are you most worried
about. We don't care about chacha20 being exactly chacha20, so long
as it's cryptographically strong. In fact I think I removed a
potential host order byteswap in the set key operation specifically
because we don't care and interop.

If this is required for FIPS mode, we can add that later. I was
primarily trying to keep the patch small to be easier for people to
look at it, so I've deliberately left off bells and whistles that
aren't strictly needed to show that the new approach is sound.

> > + if (crng_init++ >= 2)
> > + wake_up_interruptible(&crng_init_wait);
>
> Don't we have a race here with the crng_init < 3 check in crng_reseed
> considering multi-core systems?

No, because we are holding the primary_crng->lock spinlock. I'll add
a comment explaining the locking protections which is intended for
crng_init where we declare it.

> > + if (num < 16 || num > 32) {
> > + WARN_ON(1);
> > + pr_err("crng_reseed: num is %d?!?\n", num);
> > + }
> > + num_words = (num + 3) / 4;
> > + for (i = 0; i < num_words; i++)
> > + primary_crng.state[i+4] ^= tmp[i];
> > + primary_crng.init_time = jiffies;
> > + if (crng_init < 3) {
>
> Shouldn't that one be if (crng_init < 3 && num >= 16) ?

I'll just change the above WRN_ON test to be:

BUG_ON(num < 16 || num > 32);

This really can't happen, and I had it as a WARN_ON with a printk for
debugging purpose in case I was wrong about how the code works.

> > + crng_init = 3;
> > + process_random_ready_list();
> > + wake_up_interruptible(&crng_init_wait);
> > + pr_notice("random: crng_init 3\n");
>
> Would it make sense to be more descriptive here to allow readers of dmesg to
> understand the output?

Yes, what we're currently printing during the initialization:

random: crng_init 1
random: crng_init 2
random: crng_init 3

was again mostly for debugging purposes. What I'm thinking about
doing is changing crng_init 2 and 3 messages to be:

random: crng fast init done
random: crng conservative init done

> > + }
> > + ret = 1;
> > +out:
> > + spin_unlock_irqrestore(&primary_crng.lock, flags);
>
> memzero_explicit of tmp?

Good point, I've added a call to memzero_explicit().

> > +static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE])
> > +{
> > + unsigned long v, flags;
> > + struct crng_state *crng = &primary_crng;
> > +
> > + if (crng_init > 2 &&
> > + time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL))
> > + crng_reseed(&input_pool);
> > + spin_lock_irqsave(&crng->lock, flags);
> > + if (arch_get_random_long(&v))
> > + crng->state[14] ^= v;
> > + chacha20_block(&crng->state[0], out);
> > + if (crng->state[12] == 0)
> > + crng->state[13]++;

> What is the purpose to only cover the 2nd 32 bit value of the nonce with
> arch_get_random?
>
> state[12]++? Or why do you increment the nonce?

In Chacha20, its output is a funcrtion of the state array, where
state[0..3] is a constant specified by the Chacha20 definition,
state[4..11] is the Key, and state[12..15] is the IV. The
chacha20_block() function increments state[12] each time it is called,
so state[12] is being used as the block counter. The increment of
state[13] is used to make state[13] to be the upper 32-bits of the
block counter. We *should* be reseeding more often than 2**32 calls
to chacha20_block(), and the increment is just to be safe in case
something goes wronng and we're not reseeding.

We're using crng[14] to be contain the output of RDRAND, so this is
where we mix in the contribution from a CPU-level hwrng. Previously
we called RDRAND multiple times and XOR'ed the results into the
output. This is a bit faster and is more comforting for paranoiacs
who are concerned that Intel might have down something shifty enough
to be able to read from memory and change the output of RDRAND to
force a particular output from the RNG.

Ware using state[15] to contain the NUMA node id. This was because
originally the used used the same key bytes (state[4..11]) between
different NUMA state arrays, so state[15] was used to guarantee that
state arrays would be different between different NUMA node state
arrays. Now that we are deriving the key from a primary_crng, we
could use state[15] for something else, including as a second
destination from RDRAND.

> Now I have to wear my (ugly) FIPS heat: we need that code from the current
> implementation here:
>
> if (fips_enabled) {
> spin_lock_irqsave(&r->lock, flags);
> if (!memcmp(tmp, r->last_data, EXTRACT_SIZE))
> panic("Hardware RNG duplicated output!\n");
> memcpy(r->last_data, tmp, EXTRACT_SIZE);
> spin_unlock_irqrestore(&r->lock, flags);
> }
>

I'll add FIPS support as a separate patch. I personally consider FIPS
support to be a distraction, and it's only useful for people who need
to sell to governments (mostly US governments).

> > - if (unlikely(nonblocking_pool.initialized == 0))
> > - printk_once(KERN_NOTICE "random: %s urandom read "
> > - "with %d bits of entropy available\n",
> > - current->comm, nonblocking_pool.entropy_total);
> > -
> > + crng_wait_ready();
>
> Just for clarification: are you now blocking /dev/urandom until the CRNG is
> filled? That would be a big win.

Until the the fast init state, yes. In practice we are blocking until
128 interrupts have occurred, which during boot is hapens quickly
enough that even on a simple KVM system, this happens before userspace
starts up. There *is* a risk here, though. Imagine a embedded
platform with very few interrupt-driven devices so device probing
isn't enough to make the CRNG ready when the initial ramdisk starts
up, and the init program tries to read from /dev/urandom --- which
then blocks, potentially indefinitely.

If that happens, then we will have broken userspace, and we may need
to revert this part of the change. But I'm willing to take the risk,
in hopes that such super-simplisitic devices don't exist in real life,
and if they do, the IOT devices will probably be blithely ignoring
cryptographic concerns so much that they aren't using /dev/urandom
anyway. :-)

>Would it make sense to add another chacha20_block() call here at the end?
>Note, the one thing about the SP800-90A DRBG I really like is the enhanced
>backward secrecy support which is implemented by "updating" the internal state
>(the key / state) used for one or more random number generation rounds after
>one request for random numbers is satisfied.
>
>This means that even if the state becomes known or the subsequent caller
>manages to deduce the state of the RNG to some degree of confidence, he cannot
>backtrack the already generated random numbers.

That's a good idea; being able to prevent back-tracking attacks is
a good thing. I'll add this in the next version.

Cheers,

- Ted

2016-05-04 17:49:17

[permalink] [raw]

Subject: Re: [PATCH 1/3] random: replace non-blocking pool with a Chacha20-based CRNG

On Wed, May 04, 2016 at 10:40:20AM -0400, Jeffrey Walton wrote:
> > +static inline u32 rotl32(u32 v, u8 n)
> > +{
> > + return (v << n) | (v >> (sizeof(v) * 8 - n));
> > +}
>
> That's undefined behavior when n=0.

Sure, but it's never called with n = 0; I've double checked and the
compiler seems to do the right thing with the above pattern as well.

Hmm, it looks like there is a "standard" version rotate left and right
defined in include/linux/bitops.h. So I suspect it would make sense
to use rol32 as defined in bitops.h --- and this is probably something
that we should do for the rest of crypto/*.c, where people seem to be
defininig their own version of something like rotl32 (I copied the
contents of crypto/chacha20_generic.c to lib/chacha20, so this pattern
of defining one's own version of rol32 isn't new).

> I think the portable way to do a rotate that avoids UB is the
> following. GCC, Clang and ICC recognize the pattern, and emit a rotate
> instruction.
>
> static const unsigned int MASK=31;
> return (v<<n)|(v>>(-n&MASK));
>
> You should also avoid the following because its not constant time due
> to the branch:
>
> return n == 0 ? v : (v << n) | (v >> (sizeof(v) * 8 - n));
>

Where is this coming from? I don't see this construct in the patch.

- Ted

2016-05-04 17:52:44

[permalink] [raw]

Subject: Re: [PATCH 1/3] random: replace non-blocking pool with a Chacha20-based CRNG

On May 4, 2016 10:30:41 AM PDT, [email protected] wrote:
>On Tue, May 03, 2016 at 10:50:25AM +0200, Stephan Mueller wrote:
>> > +/*
>> > + * crng_init = 0 --> Uninitialized
>> > + * 2 --> Initialized
>> > + * 3 --> Initialized from input_pool
>> > + */
>> > +static int crng_init = 0;
>>
>> shouldn't that be an atomic_t ?
>
>The crng_init variable only gets incremented (it progresses from
>0->1->2->3) and it's protected by the primary_crng->lock spinlock.
>There are a few places where we read it without first taking the lock,
>but that's where we depend on it getting incremented and where if we
>race with another CPU which has just bumped the crng_init status, it's
>not critical. (Or we can take the lock and then recheck the crng_init
>if we really need to be sure.)
>
>> > +static void _initialize_crng(struct crng_state *crng)
>> > +{
>> > + int i;
>> > + unsigned long rv;
>>
>> Why do you use unsigned long here? I thought the state[i] is unsigned
>int.
>
>Because it gets filled in via arch_get_random_seed_long(&rv) and
>arch_get_random_log(&rv). If that means we get 64 bits and we only
>use 32 bits, that's OK....
>
>> Would it make sense to add the ChaCha20 self test vectors from
>RFC7539 here to
>> test that the ChaCha20 works?
>
>We're not doing that for any of the other ciphers and hashes. We
>didn't do that for SHA1, and the chacha20 code where I took this from
>didn't check for this as well. What threat are you most worried
>about. We don't care about chacha20 being exactly chacha20, so long
>as it's cryptographically strong. In fact I think I removed a
>potential host order byteswap in the set key operation specifically
>because we don't care and interop.
>
>If this is required for FIPS mode, we can add that later. I was
>primarily trying to keep the patch small to be easier for people to
>look at it, so I've deliberately left off bells and whistles that
>aren't strictly needed to show that the new approach is sound.
>
>> > + if (crng_init++ >= 2)
>> > + wake_up_interruptible(&crng_init_wait);
>>
>> Don't we have a race here with the crng_init < 3 check in crng_reseed
>
>> considering multi-core systems?
>
>No, because we are holding the primary_crng->lock spinlock. I'll add
>a comment explaining the locking protections which is intended for
>crng_init where we declare it.
>
>
>> > + if (num < 16 || num > 32) {
>> > + WARN_ON(1);
>> > + pr_err("crng_reseed: num is %d?!?\n", num);
>> > + }
>> > + num_words = (num + 3) / 4;
>> > + for (i = 0; i < num_words; i++)
>> > + primary_crng.state[i+4] ^= tmp[i];
>> > + primary_crng.init_time = jiffies;
>> > + if (crng_init < 3) {
>>
>> Shouldn't that one be if (crng_init < 3 && num >= 16) ?
>
>I'll just change the above WRN_ON test to be:
>
> BUG_ON(num < 16 || num > 32);
>
>This really can't happen, and I had it as a WARN_ON with a printk for
>debugging purpose in case I was wrong about how the code works.
>
>> > + crng_init = 3;
>> > + process_random_ready_list();
>> > + wake_up_interruptible(&crng_init_wait);
>> > + pr_notice("random: crng_init 3\n");
>>
>> Would it make sense to be more descriptive here to allow readers of
>dmesg to
>> understand the output?
>
>Yes, what we're currently printing during the initialization:
>
>random: crng_init 1
>random: crng_init 2
>random: crng_init 3
>
>was again mostly for debugging purposes. What I'm thinking about
>doing is changing crng_init 2 and 3 messages to be:
>
>random: crng fast init done
>random: crng conservative init done
>
>> > + }
>> > + ret = 1;
>> > +out:
>> > + spin_unlock_irqrestore(&primary_crng.lock, flags);
>>
>> memzero_explicit of tmp?
>
>Good point, I've added a call to memzero_explicit().
>
>> > +static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE])
>> > +{
>> > + unsigned long v, flags;
>> > + struct crng_state *crng = &primary_crng;
>> > +
>> > + if (crng_init > 2 &&
>> > + time_after(jiffies, crng->init_time + CRNG_RESEED_INTERVAL))
>> > + crng_reseed(&input_pool);
>> > + spin_lock_irqsave(&crng->lock, flags);
>> > + if (arch_get_random_long(&v))
>> > + crng->state[14] ^= v;
>> > + chacha20_block(&crng->state[0], out);
>> > + if (crng->state[12] == 0)
>> > + crng->state[13]++;
>
>> What is the purpose to only cover the 2nd 32 bit value of the nonce
>with
>> arch_get_random?
>>
>> state[12]++? Or why do you increment the nonce?
>
>In Chacha20, its output is a funcrtion of the state array, where
>state[0..3] is a constant specified by the Chacha20 definition,
>state[4..11] is the Key, and state[12..15] is the IV. The
>chacha20_block() function increments state[12] each time it is called,
>so state[12] is being used as the block counter. The increment of
>state[13] is used to make state[13] to be the upper 32-bits of the
>block counter. We *should* be reseeding more often than 2**32 calls
>to chacha20_block(), and the increment is just to be safe in case
>something goes wronng and we're not reseeding.
>
>We're using crng[14] to be contain the output of RDRAND, so this is
>where we mix in the contribution from a CPU-level hwrng. Previously
>we called RDRAND multiple times and XOR'ed the results into the
>output. This is a bit faster and is more comforting for paranoiacs
>who are concerned that Intel might have down something shifty enough
>to be able to read from memory and change the output of RDRAND to
>force a particular output from the RNG.
>
>Ware using state[15] to contain the NUMA node id. This was because
>originally the used used the same key bytes (state[4..11]) between
>different NUMA state arrays, so state[15] was used to guarantee that
>state arrays would be different between different NUMA node state
>arrays. Now that we are deriving the key from a primary_crng, we
>could use state[15] for something else, including as a second
>destination from RDRAND.
>
>> Now I have to wear my (ugly) FIPS heat: we need that code from the
>current
>> implementation here:
>>
>> if (fips_enabled) {
>> spin_lock_irqsave(&r->lock, flags);
>> if (!memcmp(tmp, r->last_data, EXTRACT_SIZE))
>> panic("Hardware RNG duplicated
>output!\n");
>> memcpy(r->last_data, tmp, EXTRACT_SIZE);
>> spin_unlock_irqrestore(&r->lock, flags);
>> }
>>
>
>I'll add FIPS support as a separate patch. I personally consider FIPS
>support to be a distraction, and it's only useful for people who need
>to sell to governments (mostly US governments).
>
>> > - if (unlikely(nonblocking_pool.initialized == 0))
>> > - printk_once(KERN_NOTICE "random: %s urandom read "
>> > - "with %d bits of entropy available\n",
>> > - current->comm, nonblocking_pool.entropy_total);
>> > -
>> > + crng_wait_ready();
>>
>> Just for clarification: are you now blocking /dev/urandom until the
>CRNG is
>> filled? That would be a big win.
>
>Until the the fast init state, yes. In practice we are blocking until
>128 interrupts have occurred, which during boot is hapens quickly
>enough that even on a simple KVM system, this happens before userspace
>starts up. There *is* a risk here, though. Imagine a embedded
>platform with very few interrupt-driven devices so device probing
>isn't enough to make the CRNG ready when the initial ramdisk starts
>up, and the init program tries to read from /dev/urandom --- which
>then blocks, potentially indefinitely.
>
>If that happens, then we will have broken userspace, and we may need
>to revert this part of the change. But I'm willing to take the risk,
>in hopes that such super-simplisitic devices don't exist in real life,
>and if they do, the IOT devices will probably be blithely ignoring
>cryptographic concerns so much that they aren't using /dev/urandom
>anyway. :-)
>
>>Would it make sense to add another chacha20_block() call here at the
>end?
>>Note, the one thing about the SP800-90A DRBG I really like is the
>enhanced
>>backward secrecy support which is implemented by "updating" the
>internal state
>>(the key / state) used for one or more random number generation rounds
>after
>>one request for random numbers is satisfied.
>>
>>This means that even if the state becomes known or the subsequent
>caller
>>manages to deduce the state of the RNG to some degree of confidence,
>he cannot
>>backtrack the already generated random numbers.
>
>That's a good idea; being able to prevent back-tracking attacks is
>a good thing. I'll add this in the next version.
>
>Cheers,
>
> - Ted

Why not use arch_get_random*_int()
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.

2016-05-04 18:22:26

[permalink] [raw]

Subject: Re: [PATCH 1/3] random: replace non-blocking pool with a Chacha20-based CRNG

On Wed, May 4, 2016 at 1:49 PM, <[email protected]> wrote:
> On Wed, May 04, 2016 at 10:40:20AM -0400, Jeffrey Walton wrote:
>> > +static inline u32 rotl32(u32 v, u8 n)
>> > +{
>> > + return (v << n) | (v >> (sizeof(v) * 8 - n));
>> > +}
>>
>> That's undefined behavior when n=0.
>
> Sure, but it's never called with n = 0; I've double checked and the
> compiler seems to do the right thing with the above pattern as well.

> Hmm, it looks like there is a "standard" version rotate left and right
> defined in include/linux/bitops.h. So I suspect it would make sense
> to use rol32 as defined in bitops.h --- and this is probably something

bitops.h could work in this case, but its not an ideal solution. GCC
does not optimize the code below as expected under all use cases
because GCC does not recognize it as a rotate (see
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57157):

return (v << n) | (v >> (sizeof(v) * 8 - n));

And outside of special cases like Salsa, ChaCha and BLAKE2, the code
provided in bitops.h suffers UB on arbitrary data. So I think care
needs to be taken when selecting functions from bitops.h.

> that we should do for the rest of crypto/*.c, where people seem to be
> defininig their own version of something like rotl32 (I copied the
> contents of crypto/chacha20_generic.c to lib/chacha20, so this pattern
> of defining one's own version of rol32 isn't new).

Yeah, I kind of thought there was some duplication going on.

But I think bitops.h should be fixed. Many folks don't realize the
lurking UB, and many folks don't realize its not always optimized
well.

>> I think the portable way to do a rotate that avoids UB is the
>> following. GCC, Clang and ICC recognize the pattern, and emit a rotate
>> instruction.
>>
>> static const unsigned int MASK=31;
>> return (v<<n)|(v>>(-n&MASK));
>>
>> You should also avoid the following because its not constant time due
>> to the branch:
>>
>> return n == 0 ? v : (v << n) | (v >> (sizeof(v) * 8 - n));
>>
>
> Where is this coming from? I don't see this construct in the patch.

My bad... It was a general observation. I've seen folks try to correct
the UB by turning to something like that.

Jeff

2016-05-04 18:29:57

[permalink] [raw]

Subject: Re: [PATCH 1/3] random: replace non-blocking pool with a Chacha20-based CRNG

On May 4, 2016 11:22:25 AM PDT, Jeffrey Walton <[email protected]> wrote:
>On Wed, May 4, 2016 at 1:49 PM, <[email protected]> wrote:
>> On Wed, May 04, 2016 at 10:40:20AM -0400, Jeffrey Walton wrote:
>>> > +static inline u32 rotl32(u32 v, u8 n)
>>> > +{
>>> > + return (v << n) | (v >> (sizeof(v) * 8 - n));
>>> > +}
>>>
>>> That's undefined behavior when n=0.
>>
>> Sure, but it's never called with n = 0; I've double checked and the
>> compiler seems to do the right thing with the above pattern as well.
>
>> Hmm, it looks like there is a "standard" version rotate left and
>right
>> defined in include/linux/bitops.h. So I suspect it would make sense
>> to use rol32 as defined in bitops.h --- and this is probably
>something
>
>bitops.h could work in this case, but its not an ideal solution. GCC
>does not optimize the code below as expected under all use cases
>because GCC does not recognize it as a rotate (see
>http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57157):
>
> return (v << n) | (v >> (sizeof(v) * 8 - n));
>
>And outside of special cases like Salsa, ChaCha and BLAKE2, the code
>provided in bitops.h suffers UB on arbitrary data. So I think care
>needs to be taken when selecting functions from bitops.h.
>
>> that we should do for the rest of crypto/*.c, where people seem to be
>> defininig their own version of something like rotl32 (I copied the
>> contents of crypto/chacha20_generic.c to lib/chacha20, so this
>pattern
>> of defining one's own version of rol32 isn't new).
>
>Yeah, I kind of thought there was some duplication going on.
>
>But I think bitops.h should be fixed. Many folks don't realize the
>lurking UB, and many folks don't realize its not always optimized
>well.
>
>>> I think the portable way to do a rotate that avoids UB is the
>>> following. GCC, Clang and ICC recognize the pattern, and emit a
>rotate
>>> instruction.
>>>
>>> static const unsigned int MASK=31;
>>> return (v<<n)|(v>>(-n&MASK));
>>>
>>> You should also avoid the following because its not constant time
>due
>>> to the branch:
>>>
>>> return n == 0 ? v : (v << n) | (v >> (sizeof(v) * 8 - n));
>>>
>>
>> Where is this coming from? I don't see this construct in the patch.
>
>My bad... It was a general observation. I've seen folks try to correct
>the UB by turning to something like that.
>
>Jeff

We don't care about UB, we care about gcc, and to a lesser extent LLVM and ICC. If bitops.h doesn't do the right thing, we need to fix bitops.h.
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.

2016-05-04 19:07:43

by Ted Ts'o

[permalink] [raw]

Subject: Re: [PATCH 1/3] random: replace non-blocking pool with a Chacha20-based CRNG

On Wed, May 04, 2016 at 11:29:57AM -0700, H. Peter Anvin wrote:
>
> We don't care about UB, we care about gcc, and to a lesser extent
> LLVM and ICC. If bitops.h doesn't do the right thing, we need to
> fix bitops.h.

I'm going to suggest that we treat the ro[rl]{32,64}() question as
separable from the /dev/random case. I've sanity checked gcc 5.3.1
and it does the right thing given the small number of constant
arguments given to rotl32() in lib/chacha20.c, and it doesn't hit the
UB case which Jeffrey was concerned about. This is also code that was
previously in crypto/chacha20_generic.c, and so if there is a bug with
some obscure compiler not properly compiling it down to a rotate
instruction, (a) no one is paying me to make sure the kernel code
compiles well on ICC, and (b) it's not scalable to have each kernel
developer try to deal with the vagrancies of compilers.

So from my perspective, the only interesting results for me is (a)
using what had been used before with crypto/chacha20_generic.c, or (b)
reusing what is in bitops.h and letting it be someone else's problem
if some obscure compiler isn't happy with what is in bitops.h

If we are all agreed that what is in bitops.h is considered valid,
then we can start converting people over to using the version defined
in bitops.h, and if there is some compiler issue we need to work
around, at least we only need to put the workaround in a single header
file.

Cheers,
- Ted

2016-05-04 20:53:48

[permalink] [raw]

Subject: Re: [PATCH 1/3] random: replace non-blocking pool with a Chacha20-based CRNG

On 05/04/2016 12:07 PM, [email protected] wrote:
>
> If we are all agreed that what is in bitops.h is considered valid,
> then we can start converting people over to using the version defined
> in bitops.h, and if there is some compiler issue we need to work
> around, at least we only need to put the workaround in a single header
> file.
>

Yes, please.

-hpa

2016-05-04 21:49:36

[permalink] [raw]

Subject: Re: [PATCH 1/3] random: replace non-blocking pool with a Chacha20-based CRNG

On 05/04/2016 12:07 PM, [email protected] wrote:

> it doesn't hit the
> UB case which Jeffrey was concerned about.

That should be good enough for present purposes....

However, in the interests of long-term maintainability, I
would suggest sticking in a comment or assertion saying
that ror32(,shift) is never called with shift=0. This
can be removed if/when bitops.h is upgraded.

There is a track record of compilers doing Bad Things in
response to UB code, including some very counterintuitive
Bad Things.

On Wed, May 04, 2016 at 11:29:57AM -0700, H. Peter Anvin wrote:
>>
>> If bitops.h doesn't do the right thing, we need to
>> fix bitops.h.

Most of the ror and rol functions in linux/bitops.h
should be considered unsafe, as currently implemented.
http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/include/linux/bitops.h?id=04974df8049fc4240d22759a91e035082ccd18b4#n103

I don't see anything in the file that suggests any limits
on the range of the second argument. So at best it is an
undocumented trap for the unwary. This has demonstrably
been a problem in the past. The explanation in the attached
fix-rol32.diff makes amusing reading.

Of the eight functions
ror64, rol64, ror32, rol32, ror16, rol16, ror8, and rol8,
only one of them can handle shifting by zero, namely rol32.
It was upgraded on Thu Dec 3 22:04:01 2015; see the attached
fix-rol32.diff.

I find it very odd that the other seven functions were not
upgraded. I suggest the attached fix-others.diff would make
things more consistent.

Beware that shifting by an amount >= the number of bits in the
word remains Undefined Behavior. This should be either documented
or fixed. It could be fixed easily enough.

Attachments:

fix-rol32.diff (1.81 kB)
fix-others.diff (2.43 kB)
Download all attachments

2016-05-04 21:52:29

[permalink] [raw]

Subject: Re: better patch for linux/bitops.h

On 05/04/2016 02:42 PM, I wrote:

> I find it very odd that the other seven functions were not
> upgraded. I suggest the attached fix-others.diff would make
> things more consistent.

Here's a replacement patch.
Same idea, less brain damage.
Sorry for the confusion.

Attachments:

fix-others.diff (2.43 kB)

2016-05-04 21:56:14

[permalink] [raw]

Subject: Re: [PATCH 1/3] random: replace non-blocking pool with a Chacha20-based CRNG

On May 4, 2016 2:42:53 PM PDT, John Denker <[email protected]> wrote:
>On 05/04/2016 12:07 PM, [email protected] wrote:
>
>> it doesn't hit the
>> UB case which Jeffrey was concerned about.
>
>That should be good enough for present purposes....
>
>However, in the interests of long-term maintainability, I
>would suggest sticking in a comment or assertion saying
>that ror32(,shift) is never called with shift=0. This
>can be removed if/when bitops.h is upgraded.
>
>There is a track record of compilers doing Bad Things in
>response to UB code, including some very counterintuitive
>Bad Things.
>
>On Wed, May 04, 2016 at 11:29:57AM -0700, H. Peter Anvin wrote:
>>>
>>> If bitops.h doesn't do the right thing, we need to
>>> fix bitops.h.
>
>Most of the ror and rol functions in linux/bitops.h
>should be considered unsafe, as currently implemented.
>http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/include/linux/bitops.h?id=04974df8049fc4240d22759a91e035082ccd18b4#n103
>
>I don't see anything in the file that suggests any limits
>on the range of the second argument. So at best it is an
>undocumented trap for the unwary. This has demonstrably
>been a problem in the past. The explanation in the attached
>fix-rol32.diff makes amusing reading.
>
>Of the eight functions
> ror64, rol64, ror32, rol32, ror16, rol16, ror8, and rol8,
>only one of them can handle shifting by zero, namely rol32.
>It was upgraded on Thu Dec 3 22:04:01 2015; see the attached
>fix-rol32.diff.
>
>I find it very odd that the other seven functions were not
>upgraded. I suggest the attached fix-others.diff would make
>things more consistent.
>
>Beware that shifting by an amount >= the number of bits in the
>word remains Undefined Behavior. This should be either documented
>or fixed. It could be fixed easily enough.

This construct has been supported as a rotate since at least gcc2.
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.

2016-05-04 22:06:06

[permalink] [raw]

Subject: Re: linux/bitops.h

On 05/04/2016 02:56 PM, H. Peter Anvin wrote:
>> Beware that shifting by an amount >= the number of bits in the
>> word remains Undefined Behavior.

> This construct has been supported as a rotate since at least gcc2.

How then should we understand the story told in commit d7e35dfa?
Is the story wrong?

At the very least, something inconsistent is going on. There
are 8 functions. Why did d7e35dfa change one of them but
not the other 7?

2016-05-04 23:06:42

by Andi Kleen

[permalink] [raw]

Subject: Re: linux/bitops.h

On Wed, May 04, 2016 at 03:06:04PM -0700, John Denker wrote:
> On 05/04/2016 02:56 PM, H. Peter Anvin wrote:
> >> Beware that shifting by an amount >= the number of bits in the
> >> word remains Undefined Behavior.
>
> > This construct has been supported as a rotate since at least gcc2.
>
> How then should we understand the story told in commit d7e35dfa?
> Is the story wrong?

I don't think Linux runs on a system where it would make a difference
(like a VAX), and also gcc always converts it before it could.
Even UBSan should not complain because it runs after the conversion
to ROTATE.

So it's unlikely to be a pressing issue.

-Andi

--
[email protected] -- Speaking for myself only.

2016-05-05 00:13:02

[permalink] [raw]

Subject: Re: linux/bitops.h

On 05/04/2016 04:06 PM, Andi Kleen wrote:

> gcc always converts it before it could
[make a difference].

At the moment, current versions of gcc treat the idiomatic
ror/rol code as something they support ... but older versions
do not, and future version may not.

The gcc guys have made it very clear that they reserve the
right to do absolutely anything they want in a UB situation.
-- What is true as of today might not be "always" true.
-- What is true at one level of optimization might not be
true at another.
-- The consequences can be highly nonlocal and counterintuitive.
For example, in the case of:
rslt = word << (32 - N);
...
...
if (!N) { ....... }
the compiler could assume that N is necessarily nonzero,
and many lines later it could optimize out the whole
if-block. So, even if the "<<" operator gives the right
result, there could be ghastly failures elsewhere. It
might work for some people but not others.

> So it's unlikely to be a pressing issue.

Sometimes issues that are not urgently "pressing" ought
to be dealt with in a systematic way.

There are serious people who think that avoiding UB is
a necessity, if you want the code to be reliable and
maintainable.

I renew the question: Why did commit d7e35dfa upgrade
one of the 8 functions but not the other 7?
-- I could understand 0 of 8, or 8 of 8.
-- In contrast, I'm having a hard time understanding
why 7 of the 8 use the idiomatic expression while the
8th does not.

2016-05-05 00:31:21

[permalink] [raw]

Subject: Re: linux/bitops.h

On 05/04/16 15:06, John Denker wrote:
> On 05/04/2016 02:56 PM, H. Peter Anvin wrote:
>>> Beware that shifting by an amount >= the number of bits in the
>>> word remains Undefined Behavior.
>
>> This construct has been supported as a rotate since at least gcc2.
>
> How then should we understand the story told in commit d7e35dfa?
> Is the story wrong?
>
> At the very least, something inconsistent is going on. There
> are 8 functions. Why did d7e35dfa change one of them but
> not the other 7?

Yes. d7e35dfa is baloney IMNSHO. All it does is produce worse code, and
the description even says so.

As I said, gcc has treated the former code as idiomatic since gcc 2, so
that support is beyond ancient.

-hpa

2016-05-05 00:48:40

by Linus Torvalds

[permalink] [raw]

Subject: Re: linux/bitops.h

On Wed, May 4, 2016 at 5:30 PM, H. Peter Anvin <[email protected]> wrote:
>
> Yes. d7e35dfa is baloney IMNSHO. All it does is produce worse code, and the
> description even says so.
>
> As I said, gcc has treated the former code as idiomatic since gcc 2, so that
> support is beyond ancient.

Well, we're *trying* to get clang supported, so the argument that gcc
has always supported it and compiles correct code for it is not
necessarily the whole story yet.

The problem with "32 - shift" is that it really could generate garbage
in situations where shift ends up being a constant zero..

That said, the fact that the other cases weren't changed
(rol64/ror64/ror32) does make that argument less interesting. Unless
there was some particular code that actively ended up using
"rol32(..0)" but not the other cases.

(And yes, rol32(x,0) is obviously nonsense, but it could easily happen
with inline functions or macros that end up generating that).

Note that there may be 8 "rol/ror" functions, but the 16-bit and 8-bit
ones don't have the undefined semantics. So there are only four that
had _that_ problem, although I agree that changing just one out of
four is wrong.

Linus

2016-05-05 01:20:32

[permalink] [raw]

Subject: Re: linux/bitops.h

On Wed, May 4, 2016 at 7:06 PM, Andi Kleen <[email protected]> wrote:
> On Wed, May 04, 2016 at 03:06:04PM -0700, John Denker wrote:
>> On 05/04/2016 02:56 PM, H. Peter Anvin wrote:
>> >> Beware that shifting by an amount >= the number of bits in the
>> >> word remains Undefined Behavior.
>>
>> > This construct has been supported as a rotate since at least gcc2.
>>
>> How then should we understand the story told in commit d7e35dfa?
>> Is the story wrong?
>
> I don't think Linux runs on a system where it would make a difference
> (like a VAX), and also gcc always converts it before it could.
> Even UBSan should not complain because it runs after the conversion
> to ROTATE.
>
>From what I understand, its a limitation in the barrel shifter and the
way the shift bits are handled.

Linux runs on a great number of devices, so its conceivable (likely?)
a low-cost board would have hardware limitations that not found in
modern desktops and servers or VAX.

Jeff

2016-05-05 01:27:59

[permalink] [raw]

Subject: Re: linux/bitops.h

On May 4, 2016 6:20:32 PM PDT, Jeffrey Walton <[email protected]> wrote:
>On Wed, May 4, 2016 at 7:06 PM, Andi Kleen <[email protected]> wrote:
>> On Wed, May 04, 2016 at 03:06:04PM -0700, John Denker wrote:
>>> On 05/04/2016 02:56 PM, H. Peter Anvin wrote:
>>> >> Beware that shifting by an amount >= the number of bits in the
>>> >> word remains Undefined Behavior.
>>>
>>> > This construct has been supported as a rotate since at least gcc2.
>>>
>>> How then should we understand the story told in commit d7e35dfa?
>>> Is the story wrong?
>>
>> I don't think Linux runs on a system where it would make a difference
>> (like a VAX), and also gcc always converts it before it could.
>> Even UBSan should not complain because it runs after the conversion
>> to ROTATE.
>>
>From what I understand, its a limitation in the barrel shifter and the
>way the shift bits are handled.
>
>Linux runs on a great number of devices, so its conceivable (likely?)
>a low-cost board would have hardware limitations that not found in
>modern desktops and servers or VAX.
>
>Jeff

This is a compiler feature, though. And if icc or clang don't support the idiom they would never have compiled a working kernel.
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.

2016-05-05 01:35:45

[permalink] [raw]

Subject: Re: better patch for linux/bitops.h

On Wed, May 4, 2016 at 5:52 PM, John Denker <[email protected]> wrote:
> On 05/04/2016 02:42 PM, I wrote:
>
>> I find it very odd that the other seven functions were not
>> upgraded. I suggest the attached fix-others.diff would make
>> things more consistent.
>
> Here's a replacement patch.
> ...

+1, commit it.

Its good for three additional reasons. First, this change means the
kernel is teaching the next generation the correct way to do things.
Many developers aspire to be kernel hackers, and they sometimes use
the code from bitops.h. I've actually run across the code in
production during an audit where the developers cited bitops.h.

Second, it preserves a "silent and dark" cockpit during analysis. That
is, when analysis is run, no findings are generated. Auditors and
security folks like quiet tools, much like the way pilots like their
cockpits (flashing lights and sounding buzzers usually means something
is wrong).

Third, the pattern is recognized by the major compilers, so the kernel
should not have any trouble when porting any of the compilers. I often
use multiple compiler to tease out implementation defined behavior in
a effort to achieve greater portability. Here are the citations to
ensure the kernel is safe with the pattern:

* GCC: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57157
* ICC: http://software.intel.com/en-us/forums/topic/580884

However, Clang may cause trouble because they don't want the
responsibility of recognizing the pattern:

* https://llvm.org/bugs/show_bug.cgi?id=24226#c8

Instead, they provide a defective rotate. The "defect" here is its
non-constant time due to the branch, so it may not be suitable for
high-integrity or high-assurance code like linux-crypto:

* https://llvm.org/bugs/show_bug.cgi?id=24226#c5

Jeff

2016-05-05 02:42:24

[permalink] [raw]

Subject: Re: better patch for linux/bitops.h

On May 4, 2016 6:35:44 PM PDT, Jeffrey Walton <[email protected]> wrote:
>On Wed, May 4, 2016 at 5:52 PM, John Denker <[email protected]> wrote:
>> On 05/04/2016 02:42 PM, I wrote:
>>
>>> I find it very odd that the other seven functions were not
>>> upgraded. I suggest the attached fix-others.diff would make
>>> things more consistent.
>>
>> Here's a replacement patch.
>> ...
>
>+1, commit it.
>
>Its good for three additional reasons. First, this change means the
>kernel is teaching the next generation the correct way to do things.
>Many developers aspire to be kernel hackers, and they sometimes use
>the code from bitops.h. I've actually run across the code in
>production during an audit where the developers cited bitops.h.
>
>Second, it preserves a "silent and dark" cockpit during analysis. That
>is, when analysis is run, no findings are generated. Auditors and
>security folks like quiet tools, much like the way pilots like their
>cockpits (flashing lights and sounding buzzers usually means something
>is wrong).
>
>Third, the pattern is recognized by the major compilers, so the kernel
>should not have any trouble when porting any of the compilers. I often
>use multiple compiler to tease out implementation defined behavior in
>a effort to achieve greater portability. Here are the citations to
>ensure the kernel is safe with the pattern:
>
> * GCC: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57157
> * ICC: http://software.intel.com/en-us/forums/topic/580884
>
>However, Clang may cause trouble because they don't want the
>responsibility of recognizing the pattern:
>
> * https://llvm.org/bugs/show_bug.cgi?id=24226#c8
>
>Instead, they provide a defective rotate. The "defect" here is its
>non-constant time due to the branch, so it may not be suitable for
>high-integrity or high-assurance code like linux-crypto:
>
> * https://llvm.org/bugs/show_bug.cgi?id=24226#c5
>
>Jeff

So you are actually saying outright that we should sacrifice *actual* portability in favor of *theoretical* portability? What kind of twilight zone did we just step into?!
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.

2016-05-05 02:54:12

[permalink] [raw]

Subject: Re: better patch for linux/bitops.h

On Wed, May 4, 2016 at 10:41 PM, H. Peter Anvin <[email protected]> wrote:
> On May 4, 2016 6:35:44 PM PDT, Jeffrey Walton <[email protected]> wrote:
>>On Wed, May 4, 2016 at 5:52 PM, John Denker <[email protected]> wrote:
>>> On 05/04/2016 02:42 PM, I wrote:
>>>
>>>> I find it very odd that the other seven functions were not
>>>> upgraded. I suggest the attached fix-others.diff would make
>>>> things more consistent.
>>>
>>> Here's a replacement patch.
>>> ...
>>
>>+1, commit it.
>>
>>Its good for three additional reasons. First, this change means the
>>kernel is teaching the next generation the correct way to do things.
>>Many developers aspire to be kernel hackers, and they sometimes use
>>the code from bitops.h. I've actually run across the code in
>>production during an audit where the developers cited bitops.h.
>>
>>Second, it preserves a "silent and dark" cockpit during analysis. That
>>is, when analysis is run, no findings are generated. Auditors and
>>security folks like quiet tools, much like the way pilots like their
>>cockpits (flashing lights and sounding buzzers usually means something
>>is wrong).
>>
>>Third, the pattern is recognized by the major compilers, so the kernel
>>should not have any trouble when porting any of the compilers. I often
>>use multiple compiler to tease out implementation defined behavior in
>>a effort to achieve greater portability. Here are the citations to
>>ensure the kernel is safe with the pattern:
>>
>> * GCC: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57157
>> * ICC: http://software.intel.com/en-us/forums/topic/580884
>>
>>However, Clang may cause trouble because they don't want the
>>responsibility of recognizing the pattern:
>>
>> * https://llvm.org/bugs/show_bug.cgi?id=24226#c8
>>
>>Instead, they provide a defective rotate. The "defect" here is its
>>non-constant time due to the branch, so it may not be suitable for
>>high-integrity or high-assurance code like linux-crypto:
>>
>> * https://llvm.org/bugs/show_bug.cgi?id=24226#c5
>>
>>Jeff
>
> So you are actually saying outright that we should sacrifice *actual* portability in favor of *theoretical* portability? What kind of twilight zone did we just step into?!

I'm not sure what you mean. It will be well defined on all platforms.
Clang may not recognize the pattern, which means they could run
slower. GCC and ICC will be fine.

Slower but correct code is what you have to live with until the Clang
dev's fix their compiler.

Its kind of like what Dr. Jon Bentley said: "If it doesn't have to be
correct, I can make it as fast as you'd like it to be".

Jeff

2016-05-05 03:08:46

[permalink] [raw]

Subject: Re: better patch for linux/bitops.h

On May 4, 2016 7:54:12 PM PDT, Jeffrey Walton <[email protected]> wrote:
>On Wed, May 4, 2016 at 10:41 PM, H. Peter Anvin <[email protected]> wrote:
>> On May 4, 2016 6:35:44 PM PDT, Jeffrey Walton <[email protected]>
>wrote:
>>>On Wed, May 4, 2016 at 5:52 PM, John Denker <[email protected]> wrote:
>>>> On 05/04/2016 02:42 PM, I wrote:
>>>>
>>>>> I find it very odd that the other seven functions were not
>>>>> upgraded. I suggest the attached fix-others.diff would make
>>>>> things more consistent.
>>>>
>>>> Here's a replacement patch.
>>>> ...
>>>
>>>+1, commit it.
>>>
>>>Its good for three additional reasons. First, this change means the
>>>kernel is teaching the next generation the correct way to do things.
>>>Many developers aspire to be kernel hackers, and they sometimes use
>>>the code from bitops.h. I've actually run across the code in
>>>production during an audit where the developers cited bitops.h.
>>>
>>>Second, it preserves a "silent and dark" cockpit during analysis.
>That
>>>is, when analysis is run, no findings are generated. Auditors and
>>>security folks like quiet tools, much like the way pilots like their
>>>cockpits (flashing lights and sounding buzzers usually means
>something
>>>is wrong).
>>>
>>>Third, the pattern is recognized by the major compilers, so the
>kernel
>>>should not have any trouble when porting any of the compilers. I
>often
>>>use multiple compiler to tease out implementation defined behavior in
>>>a effort to achieve greater portability. Here are the citations to
>>>ensure the kernel is safe with the pattern:
>>>
>>> * GCC: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57157
>>> * ICC: http://software.intel.com/en-us/forums/topic/580884
>>>
>>>However, Clang may cause trouble because they don't want the
>>>responsibility of recognizing the pattern:
>>>
>>> * https://llvm.org/bugs/show_bug.cgi?id=24226#c8
>>>
>>>Instead, they provide a defective rotate. The "defect" here is its
>>>non-constant time due to the branch, so it may not be suitable for
>>>high-integrity or high-assurance code like linux-crypto:
>>>
>>> * https://llvm.org/bugs/show_bug.cgi?id=24226#c5
>>>
>>>Jeff
>>
>> So you are actually saying outright that we should sacrifice *actual*
>portability in favor of *theoretical* portability? What kind of
>twilight zone did we just step into?!
>
>I'm not sure what you mean. It will be well defined on all platforms.
>Clang may not recognize the pattern, which means they could run
>slower. GCC and ICC will be fine.
>
>Slower but correct code is what you have to live with until the Clang
>dev's fix their compiler.
>
>Its kind of like what Dr. Jon Bentley said: "If it doesn't have to be
>correct, I can make it as fast as you'd like it to be".
>
>Jeff

The current code works on all compilers we care about. The code you propose does not; it doesn't work on anything but very recent versions of our flagship target compiler, and pretty your own admission might even cause security hazards in the kernel if compiled on clang.

That qualifies as insane in my book. The church code is de facto portable with the intended outcome, the one you propose is not.
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.

2016-05-05 03:30:25

[permalink] [raw]

Subject: Re: better patch for linux/bitops.h

>>> So you are actually saying outright that we should sacrifice *actual*
>>portability in favor of *theoretical* portability? What kind of
>>twilight zone did we just step into?!
>>
>>I'm not sure what you mean. It will be well defined on all platforms.
>>Clang may not recognize the pattern, which means they could run
>>slower. GCC and ICC will be fine.
>>
>>Slower but correct code is what you have to live with until the Clang
>>dev's fix their compiler.
>>
>>Its kind of like what Dr. Jon Bentley said: "If it doesn't have to be
>>correct, I can make it as fast as you'd like it to be".
>
> The current code works on all compilers we care about. The code you propose does not; it doesn't work on anything but very recent versions of our flagship target compiler, and pretty your own admission might even cause security hazards in the kernel if compiled on clang.

I'm not sure how you're arriving at the conclusion the code does not work.

> That qualifies as insane in my book.

OK, thanks.

I see the kernel is providing IPSec, SSL/TLS, etc. You can make
SSL/TLS run faster by using aNULL and eNULL.

Jeff

2016-05-05 03:50:39

[permalink] [raw]

Subject: Re: better patch for linux/bitops.h

Instead of arguing over who's "sane" or "insane", can we come up with
a agreed upon set of tests, and a set of compiler and compiler
versions for which these tests must achieve at least *working* code?
Bonus points if they achieve optimal code, but what's important is
that for a wide range of GCC versions (from ancient RHEL distributions
to bleeding edge gcc 5.x compilers) this *must* work.

>From my perspective, clang and ICC producing corret code is a very
nice to have, but most shops I know of don't yet assume that clang
produces code that is trustworthy for production systems, although it
*is* great for as far as generating compiler warnings to find
potential bugs.

But instead of arguing over what works and doesn't, let's just create
the the test set and just try it on a wide range of compilers and
architectures, hmmm?

- Ted

2016-05-05 04:03:30

[permalink] [raw]

Subject: Re: better patch for linux/bitops.h

On Wed, May 4, 2016 at 11:50 PM, Theodore Ts'o <[email protected]> wrote:
> ...
> But instead of arguing over what works and doesn't, let's just create
> the the test set and just try it on a wide range of compilers and
> architectures, hmmm?

What are the requirements? Here's a short list:

* No undefined behavior
- important because the compiler writers use the C standard
* Compiles to native "rotate IMMEDIATE" if the rotate amount is a
"constant expression" and the machine provides it
- translates to a native rotate instruction if available
- "rotate IMM" can be 3 times faster than "rotate REG"
- do any architectures *not* provide a rotate?
* Compiles to native "rotate REGISTER" if the rotate is variable and
the machine provides it
- do any architectures *not* provide a rotate?
* Constant time
- important to high-integrity code
- Non-security code paths probably don't care

Maybe the first thing to do is provide a different rotates for the
constant-time requirement when its in effect?

Jeff

2016-05-05 06:35:51

[permalink] [raw]

Subject: Re: better patch for linux/bitops.h

On 05/04/16 21:03, Jeffrey Walton wrote:
> On Wed, May 4, 2016 at 11:50 PM, Theodore Ts'o <[email protected]> wrote:
>> ...
>> But instead of arguing over what works and doesn't, let's just create
>> the the test set and just try it on a wide range of compilers and
>> architectures, hmmm?
>
> What are the requirements? Here's a short list:
>
> * No undefined behavior
> - important because the compiler writers use the C standard
> * Compiles to native "rotate IMMEDIATE" if the rotate amount is a
> "constant expression" and the machine provides it
> - translates to a native rotate instruction if available
> - "rotate IMM" can be 3 times faster than "rotate REG"
> - do any architectures *not* provide a rotate?
> * Compiles to native "rotate REGISTER" if the rotate is variable and
> the machine provides it
> - do any architectures *not* provide a rotate?
> * Constant time
> - important to high-integrity code
> - Non-security code paths probably don't care
>
> Maybe the first thing to do is provide a different rotates for the
> constant-time requirement when its in effect?
>

The disagreement here is the priority between these points. In my very
strong opinion, "no undefined behavior" per the C standard is way less
important than the others; what matters is what gcc and the other
compilers we care about do. The kernel relies on various versions of
C-standard-undefined behavior *all over the place*; for one thing
sizeof(void *) == sizeof(size_t) == sizeof(unsigned long)!! but they are
well-defined in the subcontext we care about.

(And no, not all architectures provide a rotate instruction.)

-hpa

2016-05-05 16:15:38

[permalink] [raw]

Subject: Re: UB in general ... and linux/bitops.h in particular

On 05/04/2016 11:35 PM, H. Peter Anvin wrote:

> The disagreement here is the priority between these points.

Yes.

As usual, all the extremes are wrong.
Tradeoffs must be made.
Perspective and judgment are required.

> In my very strong opinion, "no undefined behavior" per the C standard
> is way less important than the others; what matters is what gcc and
> the other compilers we care about do.

But we don't control what the compilers do. The gcc guys have
a track record of assuming that UB gives them a license to do
whatever they want. At any moment they can change their mind
and do something new.

> The kernel relies on various versions of C-standard-undefined
> behavior *all over the place*;>

One should be careful with that argument. Not all types of UB
are created equal. There is a world of difference between
-- UB_type_1 used "all over the place" by necessity, and
-- UB_type_2 used here-and-there for convenience.

UB_type_1 defines a de_facto dialect of the language.

Ideally there would be a formal specification of the dialect,
but that's not super-urgent, because the compiler guys are
probably not crazy enough to break something if it really is
used "all over the place".

Formalized or not, UB_type_1 does not make it OK for programmers
to invoke *other* types of UB. I'll say it again: the gcc guys
have a track record of assuming UB gives them a license to do
whatever they want. The results can be very counterintuitive.

UB is a nightmare from the point of view of reliability,
security, and maintainability. The fact that your favorite
compiler does what you want as of today is *not* a guarantee
that it will do so in the future.

===========

As for the suggestion that the UB code is somehow more
efficient or in any way better, I'm not buying it.

Using gcc 5.2.1 I observe that [1] and [2] (given below)
generate the exact same code at any optimization level
from -O1 on up. My kernel is compiled with -O2.

(They generate different code with -O0 but that doesn't
seem like an important consideration.)

The relevant code is
(word >> shift) | (word >> ((-shift) & 31)); /* [1] */
(word >> shift) | (word << (32 - shift)); /* [2] */

> for one thing sizeof(void *) == sizeof(size_t) == sizeof(unsigned long)!!

I assume that comes up in the context of type punning. I
am not a language lawyer, but it is my understanding that
type punning *is* permitted by the C language specification.
(C++ is another story entirely, but not relevant here.)

There is a world of difference between
-- loosely specified options (LSO), and
-- undefined behavior (UB)

The sizeof() example is LSO not UB. One could easily check
the sizes at compile time, so that no looseness remains.
The result is perfectly reasonable, efficient, reliable code.

Similarly, the kernel assumes two's complement arithmetic
"all over the place" but this is LSO not UB.

This is relevant to linux/bitops.h because [2] is UB when
shift==0. In contrast [1] is a very mild example of LSO
because it assumes two's complement.

I consider it a step in the right direction to get rid of UB
when it can be done at zero cost. UB is dangerous.

==========================

Suggestions:

a) Going forward, I suggest that UB should not be invoked
unless there is a good solid reason.

b) In cases where a this-or-that UB really is needed, it
should be carefully documented.

-- Perhaps there could be a file linux_kernel_dialect.c
that gives examples of all the constructs that the kernel
needs but are not in the basic C specification. One would
hope this would be very, very short.

-- Perhaps the compiler guys could be persuaded to support
the needed features explicitly, perhaps via a command-line
option: -std=vanilla
This should be a no-cost option as things stand today, but
it helps to prevent nasty surprises in the future.

2016-05-05 17:32:18

by Andi Kleen

[permalink] [raw]

Subject: Re: UB in general ... and linux/bitops.h in particular

> Suggestions:
>
> a) Going forward, I suggest that UB should not be invoked
> unless there is a good solid reason.

Good luck rewriting most of the kernel source.

This discussion is insane!

-Andi

2016-05-05 21:34:52

by Sandy Harris

[permalink] [raw]

Subject: Re: better patch for linux/bitops.h

On Wed, May 4, 2016 at 11:50 PM, Theodore Ts'o <[email protected]> wrote:

> Instead of arguing over who's "sane" or "insane", can we come up with
> a agreed upon set of tests, and a set of compiler and compiler
> versions ...

I completely fail to see why tests or compiler versions should be
part of the discussion. The C standard says the behaviour in
certain cases is undefined, so a standard-compliant compiler
can generate more-or-less any code there.

As long as any of portability, reliability or security are among our
goals, any code that can give undefined behaviour should be
considered problematic.

> But instead of arguing over what works and doesn't, let's just create
> the the test set and just try it on a wide range of compilers and
> architectures, hmmm?

No. Let's just fix the code so that undefined behaviour cannot occur.

Creating test cases for a fix and trying them on a range of systems
would be useful, perhaps essential, work. Doing tests without a fix
would be a complete waste of time.

2016-05-05 22:18:27

[permalink] [raw]

Subject: Re: better patch for linux/bitops.h

On Thu, May 05, 2016 at 05:34:50PM -0400, Sandy Harris wrote:
>
> I completely fail to see why tests or compiler versions should be
> part of the discussion. The C standard says the behaviour in
> certain cases is undefined, so a standard-compliant compiler
> can generate more-or-less any code there.
>

> As long as any of portability, reliability or security are among our
> goals, any code that can give undefined behaviour should be
> considered problematic.

Because compilers have been known not necessarily to obey the specs,
and/or interpret the specs in way that not everyone agrees with. It's
also the case that we are *already* disabling certain C optimizations
which are technically allowed by the spec, but which kernel
programmers consider insane (e.g., strict aliasing).

And of course, memzero_explicit() which crypto people understand is
necessary, is something that technically compilers are allowed to
optimize according to the spec. So trying to write secure kernel code
which will work on arbitrary compilers may well be impossible.

And which is also why people have said (mostly in jest), "A
sufficiently advanced compiler is indistinguishable from an
adversary." (I assume people will agree that optimizing away a memset
needed to clear secrets from memory would be considered adversarial,
at the very least!)

So this is why I tend to take a much more pragmatic viewpoint on
things. Sure, it makes sense to pay attention to what the C standard
writers are trying to do to us; but if we need to suppress certain
optimizations to write sane kernel code --- I'm ok with that. And
this is why using a trust-but-verify on a specific set of compilers
and ranges of compiler versions is a really good idea....

- Ted

2016-05-05 22:22:27

[permalink] [raw]

Subject: Re: better patch for linux/bitops.h

On 05/05/2016 03:18 PM, [email protected] wrote:
> On Thu, May 05, 2016 at 05:34:50PM -0400, Sandy Harris wrote:
>>
>> I completely fail to see why tests or compiler versions should be
>> part of the discussion. The C standard says the behaviour in
>> certain cases is undefined, so a standard-compliant compiler
>> can generate more-or-less any code there.
>>
>
>> As long as any of portability, reliability or security are among our
>> goals, any code that can give undefined behaviour should be
>> considered problematic.
>
> Because compilers have been known not necessarily to obey the specs,
> and/or interpret the specs in way that not everyone agrees with. It's
> also the case that we are *already* disabling certain C optimizations
> which are technically allowed by the spec, but which kernel
> programmers consider insane (e.g., strict aliasing).
>
> And of course, memzero_explicit() which crypto people understand is
> necessary, is something that technically compilers are allowed to
> optimize according to the spec. So trying to write secure kernel code
> which will work on arbitrary compilers may well be impossible.
>
> And which is also why people have said (mostly in jest), "A
> sufficiently advanced compiler is indistinguishable from an
> adversary." (I assume people will agree that optimizing away a memset
> needed to clear secrets from memory would be considered adversarial,
> at the very least!)
>
> So this is why I tend to take a much more pragmatic viewpoint on
> things. Sure, it makes sense to pay attention to what the C standard
> writers are trying to do to us; but if we need to suppress certain
> optimizations to write sane kernel code --- I'm ok with that. And
> this is why using a trust-but-verify on a specific set of compilers
> and ranges of compiler versions is a really good idea....
>

In theory, theory and practice should agree, but in practice, practice
is what counts. I fully agree we should get rid of UD behavior where
doing so is practical, but not at the cost of breaking real-life
compilers, expecially not gcc, and to a lesser but still very real
extent icc and clang.

I would also agree that we should push the gcc developers to add to the
manual C-standard-UD behavior which are well-defined under the
gnu89/gnu99/gnu11 C dialects.

-hpa

2016-05-05 22:39:00

[permalink] [raw]

Subject: Re: better patch for linux/bitops.h

On May 5, 2016 3:18:09 PM PDT, [email protected] wrote:
>On Thu, May 05, 2016 at 05:34:50PM -0400, Sandy Harris wrote:
>>
>> I completely fail to see why tests or compiler versions should be
>> part of the discussion. The C standard says the behaviour in
>> certain cases is undefined, so a standard-compliant compiler
>> can generate more-or-less any code there.
>>
>
>> As long as any of portability, reliability or security are among our
>> goals, any code that can give undefined behaviour should be
>> considered problematic.
>
>Because compilers have been known not necessarily to obey the specs,
>and/or interpret the specs in way that not everyone agrees with. It's
>also the case that we are *already* disabling certain C optimizations
>which are technically allowed by the spec, but which kernel
>programmers consider insane (e.g., strict aliasing).
>
>And of course, memzero_explicit() which crypto people understand is
>necessary, is something that technically compilers are allowed to
>optimize according to the spec. So trying to write secure kernel code
>which will work on arbitrary compilers may well be impossible.
>
>And which is also why people have said (mostly in jest), "A
>sufficiently advanced compiler is indistinguishable from an
>adversary." (I assume people will agree that optimizing away a memset
>needed to clear secrets from memory would be considered adversarial,
>at the very least!)
>
>So this is why I tend to take a much more pragmatic viewpoint on
>things. Sure, it makes sense to pay attention to what the C standard
>writers are trying to do to us; but if we need to suppress certain
>optimizations to write sane kernel code --- I'm ok with that. And
>this is why using a trust-but-verify on a specific set of compilers
>and ranges of compiler versions is a really good idea....
>
> - Ted

I have filed a gcc bug to have the preexisting rotate idiom officially documented as a GNU C extension.

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70967
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.

2016-05-06 00:14:23

[permalink] [raw]

Subject: Re: better patch for linux/bitops.h

On 05/05/2016 03:18 PM, [email protected] wrote:
>
> So this is why I tend to take a much more pragmatic viewpoint on
> things. Sure, it makes sense to pay attention to what the C standard
> writers are trying to do to us; but if we need to suppress certain
> optimizations to write sane kernel code --- I'm ok with that. And
> this is why using a trust-but-verify on a specific set of compilers
> and ranges of compiler versions is a really good idea....
>

For the record, the "portable" construct has apparently only been
supported since gcc 4.6.3.

-hpa

2016-05-06 02:25:17

[permalink] [raw]

Subject: Re: UB in general ... and linux/bitops.h in particular

> -- Perhaps the compiler guys could be persuaded to support
> the needed features explicitly, perhaps via a command-line
> option: -std=vanilla
> This should be a no-cost option as things stand today, but
> it helps to prevent nasty surprises in the future.

It looks LLVM has the -rainbow option; see
http://blog.llvm.org/2016/04/undefined-behavior-is-magic.html :)

Jeff

2016-05-06 20:07:57

by Sasha Levin

[permalink] [raw]

Subject: Re: linux/bitops.h

On 05/04/2016 08:30 PM, H. Peter Anvin wrote:
> On 05/04/16 15:06, John Denker wrote:
>> On 05/04/2016 02:56 PM, H. Peter Anvin wrote:
>>>> Beware that shifting by an amount >= the number of bits in the
>>>> word remains Undefined Behavior.
>>
>>> This construct has been supported as a rotate since at least gcc2.
>>
>> How then should we understand the story told in commit d7e35dfa?
>> Is the story wrong?
>>
>> At the very least, something inconsistent is going on. There
>> are 8 functions. Why did d7e35dfa change one of them but
>> not the other 7?
>
> Yes. d7e35dfa is baloney IMNSHO. All it does is produce worse code, and the description even says so.

No, the description says that it produces worse code for *really really* ancient
GCC versions.

> As I said, gcc has treated the former code as idiomatic since gcc 2, so that support is beyond ancient.

Because something works in a specific way on one compiler isn't a reason to
ignore this noncompliance with the standard.

Thanks,
Sasha

2016-05-06 20:08:58

by Sasha Levin

[permalink] [raw]

Subject: Re: linux/bitops.h

On 05/04/2016 08:48 PM, Linus Torvalds wrote:
> That said, the fact that the other cases weren't changed
> (rol64/ror64/ror32) does make that argument less interesting. Unless
> there was some particular code that actively ended up using
> "rol32(..0)" but not the other cases.

Right, the others seemed wrong as well but I couldn't find any code
that triggers that, and preferred to fix just the one I was hitting.

I can go fix the rest if that's something we want to do?

Thanks,
Sasha

2016-05-06 20:25:07

[permalink] [raw]

Subject: Re: linux/bitops.h

On May 6, 2016 1:07:13 PM PDT, Sasha Levin <[email protected]> wrote:
>On 05/04/2016 08:30 PM, H. Peter Anvin wrote:
>> On 05/04/16 15:06, John Denker wrote:
>>> On 05/04/2016 02:56 PM, H. Peter Anvin wrote:
>>>>> Beware that shifting by an amount >= the number of bits in the
>>>>> word remains Undefined Behavior.
>>>
>>>> This construct has been supported as a rotate since at least gcc2.
>>>
>>> How then should we understand the story told in commit d7e35dfa?
>>> Is the story wrong?
>>>
>>> At the very least, something inconsistent is going on. There
>>> are 8 functions. Why did d7e35dfa change one of them but
>>> not the other 7?
>>
>> Yes. d7e35dfa is baloney IMNSHO. All it does is produce worse code,
>and the description even says so.
>
>No, the description says that it produces worse code for *really
>really* ancient
>GCC versions.
>
>> As I said, gcc has treated the former code as idiomatic since gcc 2,
>so that support is beyond ancient.
>
>Because something works in a specific way on one compiler isn't a
>reason to
>ignore this noncompliance with the standard.
>
>
>Thanks,
>Sasha

4.6.2 is not "really, really ancient."
--
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.

2016-05-06 20:30:48