2023-01-18 21:42:20

by Dmitry Safonov

Subject: [PATCH v4 1/4] crypto: Introduce crypto_pool

Introduce a per-CPU pool of async crypto requests that can be used
in bh-disabled contexts (designed with net RX/TX softirqs as users in
mind). Allocation can sleep and is a slow-path.
Initial implementation has only ahash as a backend and a fixed-size array
of possible algorithms used in parallel.
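
For illustration, the intended usage looks roughly like this (a sketch
only; error handling is trimmed and the algorithm name and scratch size
are just examples):

        struct crypto_pool_ahash hp;
        int id;

        /* Control/slow path (may sleep): */
        id = crypto_pool_alloc_ahash("hmac(sha256)", scratch_size);
        if (id < 0)
                return id;

        /* Fast path (e.g. softirq); crypto_pool_start() disables bh: */
        if (!crypto_pool_start(id, &hp.base)) {
                /* ... use hp.req and hp.base.scratch here ... */
                crypto_pool_end();      /* re-enables bh */
        }

        /* Control/slow path teardown: */
        crypto_pool_release(id);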

Signed-off-by: Dmitry Safonov <[email protected]>
---
crypto/Kconfig | 3 +
crypto/Makefile | 1 +
crypto/crypto_pool.c | 333 ++++++++++++++++++++++++++++++++++++++++++
include/crypto/pool.h | 46 ++++++
4 files changed, 383 insertions(+)
create mode 100644 crypto/crypto_pool.c
create mode 100644 include/crypto/pool.h

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 9c86f7045157..7096654419cb 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1388,6 +1388,9 @@ endmenu
config CRYPTO_HASH_INFO
bool

+config CRYPTO_POOL
+ tristate
+
if !KMSAN # avoid false positives from assembly
if ARM
source "arch/arm/crypto/Kconfig"
diff --git a/crypto/Makefile b/crypto/Makefile
index d0126c915834..eed8f61bc93b 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -63,6 +63,7 @@ obj-$(CONFIG_CRYPTO_ACOMP2) += crypto_acompress.o
cryptomgr-y := algboss.o testmgr.o

obj-$(CONFIG_CRYPTO_MANAGER2) += cryptomgr.o
+obj-$(CONFIG_CRYPTO_POOL) += crypto_pool.o
obj-$(CONFIG_CRYPTO_USER) += crypto_user.o
crypto_user-y := crypto_user_base.o
crypto_user-$(CONFIG_CRYPTO_STATS) += crypto_user_stat.o
diff --git a/crypto/crypto_pool.c b/crypto/crypto_pool.c
new file mode 100644
index 000000000000..0237dad937b8
--- /dev/null
+++ b/crypto/crypto_pool.c
@@ -0,0 +1,333 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <crypto/pool.h>
+#include <linux/cpu.h>
+#include <linux/kref.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/workqueue.h>
+
+static size_t __scratch_size;
+static DEFINE_PER_CPU(void __rcu *, crypto_pool_scratch);
+
+struct crypto_pool_entry {
+ struct ahash_request * __percpu *req;
+ const char *alg;
+ struct kref kref;
+ bool needs_key;
+};
+
+#define CPOOL_SIZE (PAGE_SIZE/sizeof(struct crypto_pool_entry))
+static struct crypto_pool_entry cpool[CPOOL_SIZE];
+static unsigned int cpool_populated;
+static DEFINE_MUTEX(cpool_mutex);
+
+/* Slow-path */
+struct scratches_to_free {
+ struct rcu_head rcu;
+ unsigned int cnt;
+ void *scratches[];
+};
+static void free_old_scratches(struct rcu_head *head)
+{
+ struct scratches_to_free *stf;
+
+ stf = container_of(head, struct scratches_to_free, rcu);
+ while (stf->cnt--)
+ kfree(stf->scratches[stf->cnt]);
+ kfree(stf);
+}
+/*
+ * crypto_pool_reserve_scratch - re-allocates scratch buffer, slow-path
+ * @size: request size for the scratch/temp buffer
+ */
+static int crypto_pool_reserve_scratch(size_t size)
+{
+ struct scratches_to_free *stf;
+ size_t stf_sz = struct_size(stf, scratches, num_possible_cpus());
+ int cpu, err = 0;
+
+ lockdep_assert_held(&cpool_mutex);
+ if (__scratch_size >= size)
+ return 0;
+
+ stf = kmalloc(stf_sz, GFP_KERNEL);
+ if (!stf)
+ return -ENOMEM;
+ stf->cnt = 0;
+
+ cpus_read_lock();
+ for_each_possible_cpu(cpu) {
+ void *scratch, *old_scratch;
+
+ scratch = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
+ if (!scratch) {
+ err = -ENOMEM;
+ break;
+ }
+
+ old_scratch = rcu_replace_pointer(per_cpu(crypto_pool_scratch, cpu),
+ scratch, lockdep_is_held(&cpool_mutex));
+ if (!cpu_online(cpu) || !old_scratch) {
+ kfree(old_scratch);
+ continue;
+ }
+ stf->scratches[stf->cnt++] = old_scratch;
+ }
+ cpus_read_unlock();
+ if (!err)
+ __scratch_size = size;
+
+ call_rcu(&stf->rcu, free_old_scratches);
+ return err;
+}
+
+static void crypto_pool_scratch_free(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ kfree(rcu_replace_pointer(per_cpu(crypto_pool_scratch, cpu),
+ NULL, lockdep_is_held(&cpool_mutex)));
+ __scratch_size = 0;
+}
+
+static int __cpool_alloc_ahash(struct crypto_pool_entry *e, const char *alg)
+{
+ struct crypto_ahash *hash, *cpu0_hash;
+ int cpu, ret = -ENOMEM;
+
+ e->alg = kstrdup(alg, GFP_KERNEL);
+ if (!e->alg)
+ return -ENOMEM;
+
+ e->req = alloc_percpu(struct ahash_request *);
+ if (!e->req)
+ goto out_free_alg;
+
+ cpu0_hash = crypto_alloc_ahash(alg, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(cpu0_hash)) {
+ ret = PTR_ERR(cpu0_hash);
+ goto out_free_req;
+ }
+
+ /* If hash has .setkey(), allocate ahash per-CPU, not only request */
+ e->needs_key = crypto_ahash_get_flags(cpu0_hash) & CRYPTO_TFM_NEED_KEY;
+
+ hash = cpu0_hash;
+ for_each_possible_cpu(cpu) {
+ struct ahash_request *req;
+
+ /*
+ * If the ahash needs a key, it has to be allocated per CPU.
+ * In that case, reuse the CPU0 hash that has just been
+ * allocated above for the first iteration.
+ */
+ if (!hash)
+ hash = crypto_alloc_ahash(alg, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(hash))
+ goto out_free_per_cpu;
+
+ req = ahash_request_alloc(hash, GFP_KERNEL);
+ if (!req)
+ goto out_free_hash;
+
+ ahash_request_set_callback(req, 0, NULL, NULL);
+
+ *per_cpu_ptr(e->req, cpu) = req;
+
+ if (e->needs_key)
+ hash = NULL;
+ }
+ kref_init(&e->kref);
+ return 0;
+
+out_free_hash:
+ if (hash != cpu0_hash)
+ crypto_free_ahash(hash);
+
+out_free_per_cpu:
+ for_each_possible_cpu(cpu) {
+ struct ahash_request *req = *per_cpu_ptr(e->req, cpu);
+ struct crypto_ahash *pcpu_hash;
+
+ if (req == NULL)
+ break;
+ pcpu_hash = crypto_ahash_reqtfm(req);
+ ahash_request_free(req);
+ /* hash per-CPU, e->needs_key == true */
+ if (pcpu_hash != cpu0_hash)
+ crypto_free_ahash(pcpu_hash);
+ }
+
+ crypto_free_ahash(cpu0_hash);
+out_free_req:
+ free_percpu(e->req);
+out_free_alg:
+ kfree(e->alg);
+ e->alg = NULL;
+ return ret;
+}
+
+/**
+ * crypto_pool_alloc_ahash - allocates pool for ahash requests
+ * @alg: name of async hash algorithm
+ * @scratch_size: reserve a crypto_pool::scratch buffer of this size
+ */
+int crypto_pool_alloc_ahash(const char *alg, size_t scratch_size)
+{
+ int i, ret;
+
+ /* slow-path */
+ mutex_lock(&cpool_mutex);
+ ret = crypto_pool_reserve_scratch(scratch_size);
+ if (ret)
+ goto out;
+ for (i = 0; i < cpool_populated; i++) {
+ if (cpool[i].alg && !strcmp(cpool[i].alg, alg)) {
+ if (kref_read(&cpool[i].kref) > 0)
+ kref_get(&cpool[i].kref);
+ else
+ kref_init(&cpool[i].kref);
+ ret = i;
+ goto out;
+ }
+ }
+
+ for (i = 0; i < cpool_populated; i++) {
+ if (!cpool[i].alg)
+ break;
+ }
+ if (i >= CPOOL_SIZE) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ ret = __cpool_alloc_ahash(&cpool[i], alg);
+ if (!ret) {
+ ret = i;
+ if (i == cpool_populated)
+ cpool_populated++;
+ }
+out:
+ mutex_unlock(&cpool_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(crypto_pool_alloc_ahash);
+
+static void __cpool_free_entry(struct crypto_pool_entry *e)
+{
+ struct crypto_ahash *hash = NULL;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (*per_cpu_ptr(e->req, cpu) == NULL)
+ continue;
+
+ hash = crypto_ahash_reqtfm(*per_cpu_ptr(e->req, cpu));
+ ahash_request_free(*per_cpu_ptr(e->req, cpu));
+ if (e->needs_key) {
+ crypto_free_ahash(hash);
+ hash = NULL;
+ }
+ }
+ if (hash)
+ crypto_free_ahash(hash);
+ free_percpu(e->req);
+ kfree(e->alg);
+ memset(e, 0, sizeof(*e));
+}
+
+static void cpool_cleanup_work_cb(struct work_struct *work)
+{
+ unsigned int i;
+ bool free_scratch = true;
+
+ mutex_lock(&cpool_mutex);
+ for (i = 0; i < cpool_populated; i++) {
+ if (kref_read(&cpool[i].kref) > 0) {
+ free_scratch = false;
+ continue;
+ }
+ if (!cpool[i].alg)
+ continue;
+ __cpool_free_entry(&cpool[i]);
+ }
+ if (free_scratch)
+ crypto_pool_scratch_free();
+ mutex_unlock(&cpool_mutex);
+}
+
+static DECLARE_WORK(cpool_cleanup_work, cpool_cleanup_work_cb);
+static void cpool_schedule_cleanup(struct kref *kref)
+{
+ schedule_work(&cpool_cleanup_work);
+}
+
+/**
+ * crypto_pool_release - decreases number of users for a pool. If it was
+ * the last user of the pool, releases any memory that was consumed.
+ * @id: crypto_pool that was previously allocated by crypto_pool_alloc_ahash()
+ */
+void crypto_pool_release(unsigned int id)
+{
+ if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg))
+ return;
+
+ /* slow-path */
+ kref_put(&cpool[id].kref, cpool_schedule_cleanup);
+}
+EXPORT_SYMBOL_GPL(crypto_pool_release);
+
+/**
+ * crypto_pool_get - increases number of users (refcounter) for a pool
+ * @id: crypto_pool that was previously allocated by crypto_pool_alloc_ahash()
+ */
+void crypto_pool_get(unsigned int id)
+{
+ if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg))
+ return;
+ kref_get(&cpool[id].kref);
+}
+EXPORT_SYMBOL_GPL(crypto_pool_get);
+
+int crypto_pool_start(unsigned int id, struct crypto_pool *c)
+{
+ struct crypto_pool_ahash *ret = (struct crypto_pool_ahash *)c;
+
+ rcu_read_lock_bh();
+ if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg)) {
+ rcu_read_unlock_bh();
+ return -EINVAL;
+ }
+ ret->req = *this_cpu_ptr(cpool[id].req);
+ /*
+ * Pairs with crypto_pool_reserve_scratch(), scratch area is
+ * valid (allocated) until crypto_pool_end().
+ */
+ ret->base.scratch = rcu_dereference_bh(*this_cpu_ptr(&crypto_pool_scratch));
+ return 0;
+}
+EXPORT_SYMBOL_GPL(crypto_pool_start);
+
+/**
+ * crypto_pool_algo - return algorithm of crypto_pool
+ * @id: crypto_pool that was previously allocated by crypto_pool_alloc_ahash()
+ * @buf: buffer to return name of algorithm
+ * @buf_len: size of @buf
+ */
+size_t crypto_pool_algo(unsigned int id, char *buf, size_t buf_len)
+{
+ size_t ret = 0;
+
+ /* slow-path */
+ mutex_lock(&cpool_mutex);
+ if (cpool[id].alg)
+ ret = strscpy(buf, cpool[id].alg, buf_len);
+ mutex_unlock(&cpool_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(crypto_pool_algo);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Per-CPU pool of crypto requests");
diff --git a/include/crypto/pool.h b/include/crypto/pool.h
new file mode 100644
index 000000000000..e266c1cba7de
--- /dev/null
+++ b/include/crypto/pool.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _CRYPTO_POOL_H
+#define _CRYPTO_POOL_H
+
+#include <crypto/hash.h>
+
+/**
+ * struct crypto_pool - generic type for different crypto requests
+ * @scratch: per-CPU temporary area that can be used between
+ * crypto_pool_start() and crypto_pool_end() to perform
+ * crypto requests
+ */
+struct crypto_pool {
+ void *scratch;
+};
+
+/**
+ * struct crypto_pool_ahash - per-CPU pool of ahash_requests
+ * @base: common members that can be used by any async crypto ops
+ * @req: pre-allocated ahash request
+ */
+struct crypto_pool_ahash {
+ struct crypto_pool base;
+ struct ahash_request *req;
+};
+
+int crypto_pool_alloc_ahash(const char *alg, size_t scratch_size);
+void crypto_pool_get(unsigned int id);
+void crypto_pool_release(unsigned int id);
+
+/**
+ * crypto_pool_start - disable bh and start using crypto_pool
+ * @id: crypto_pool that was previously allocated by crypto_pool_alloc_ahash()
+ * @c: returned crypto_pool for usage (uninitialized on failure)
+ */
+int crypto_pool_start(unsigned int id, struct crypto_pool *c);
+/**
+ * crypto_pool_end - enable bh and stop using crypto_pool
+ */
+static inline void crypto_pool_end(void)
+{
+ rcu_read_unlock_bh();
+}
+size_t crypto_pool_algo(unsigned int id, char *buf, size_t buf_len);
+
+#endif /* _CRYPTO_POOL_H */
--
2.39.0


2023-01-19 10:02:27

by Herbert Xu

Subject: Re: [PATCH v4 1/4] crypto: Introduce crypto_pool

On Wed, Jan 18, 2023 at 09:41:08PM +0000, Dmitry Safonov wrote:
> Introduce a per-CPU pool of async crypto requests that can be used
> in bh-disabled contexts (designed with net RX/TX softirqs as users in
> mind). Allocation can sleep and is a slow-path.
> Initial implementation has only ahash as a backend and a fixed-size array
> of possible algorithms used in parallel.
>
> Signed-off-by: Dmitry Safonov <[email protected]>
> ---
> crypto/Kconfig | 3 +
> crypto/Makefile | 1 +
> crypto/crypto_pool.c | 333 ++++++++++++++++++++++++++++++++++++++++++
> include/crypto/pool.h | 46 ++++++
> 4 files changed, 383 insertions(+)
> create mode 100644 crypto/crypto_pool.c
> create mode 100644 include/crypto/pool.h

I'm still nacking this.

I'm currently working on per-request keys which should render
this unnecessary. With per-request keys you can simply do an
atomic kmalloc when you compute the hash.

Modelling tcp_md5 is just propagating bad code.

Thanks,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2023-01-19 18:14:42

by Dmitry Safonov

Subject: Re: [PATCH v4 1/4] crypto: Introduce crypto_pool

Hi Herbert,

On 1/19/23 09:51, Herbert Xu wrote:
> On Wed, Jan 18, 2023 at 09:41:08PM +0000, Dmitry Safonov wrote:
>> Introduce a per-CPU pool of async crypto requests that can be used
>> in bh-disabled contexts (designed with net RX/TX softirqs as users in
>> mind). Allocation can sleep and is a slow-path.
>> Initial implementation has only ahash as a backend and a fixed-size array
>> of possible algorithms used in parallel.
>>
>> Signed-off-by: Dmitry Safonov <[email protected]>
>> ---
>> crypto/Kconfig | 3 +
>> crypto/Makefile | 1 +
>> crypto/crypto_pool.c | 333 ++++++++++++++++++++++++++++++++++++++++++
>> include/crypto/pool.h | 46 ++++++
>> 4 files changed, 383 insertions(+)
>> create mode 100644 crypto/crypto_pool.c
>> create mode 100644 include/crypto/pool.h
>
> I'm still nacking this.
>
> I'm currently working on per-request keys which should render
> this unnecessary. With per-request keys you can simply do an
> atomic kmalloc when you compute the hash.

Adding per-request keys sounds like a real improvement to me, but that
is not the same issue I'm addressing here. Maybe I'm bad at describing
it, or maybe I just don't see how per-request keys would help.
Let me describe the problem I'm solving again; please feel free to
correct me inline or suggest alternatives.

The initial need for crypto_pool comes from the TCP-AO implementation
that I'm pushing upstream; see RFC 5925, which describes the option, and
the latest version of the patch set in [1]. In that patch set hashing is
used in a similar way to TCP-MD5: crypto_alloc_ahash() is called on the
slow path in setsockopt(), and the pre-allocated requests are used on
the fast path, in the TX/RX softirqs.
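
To make the fast-path side concrete, the code between crypto_pool_start()
and crypto_pool_end() would look roughly like this (a sketch; pool_id,
phdr_len and hash_out are placeholders, and the real patches do more
update calls while walking the skb):

        struct crypto_pool_ahash hp;
        struct scatterlist sg;

        if (crypto_pool_start(pool_id, &hp.base))       /* disables bh */
                return 1;       /* error */
        crypto_ahash_init(hp.req);
        /* e.g. build the pseudo-header in the shared scratch area */
        sg_init_one(&sg, hp.base.scratch, phdr_len);
        ahash_request_set_crypt(hp.req, &sg, NULL, phdr_len);
        crypto_ahash_update(hp.req);
        /* ... more updates over the segment ... */
        ahash_request_set_crypt(hp.req, NULL, hash_out, 0);
        crypto_ahash_final(hp.req);
        crypto_pool_end();                              /* enables bh */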

For TCP-AO, two algorithms are a "must have" in any compliant
implementation according to RFC 5926: HMAC-SHA-1-96 and AES-128-CMAC-96;
other algorithms are optional. But keeping in mind that SHA-1, as you
know, is not secure against collision attacks, some customers prefer to
use stronger hashes. In other words, a TCP-AO implementation needs two or
more hashing algorithms used in the same manner as TCP-MD5 uses MD5.
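In crypto API terms that presumably means pools along the lines of
crypto_pool_alloc_ahash("hmac(sha1)", scratch_size) and
crypto_pool_alloc_ahash("cmac(aes)", scratch_size), with the 96-bit
truncation done by TCP-AO itself.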

And then, looking around, I see that the same pattern (slow allocation
of a crypto request and usage on a fast path with bh disabled) is used
in other places in the kernel:
- here I convert seg6_hmac & tcp-md5 to crypto_pool
- net/ipv4/ah4.c could benefit from it: currently it calls
crypto_alloc_ahash() for every connection, allocating the user-specified
hash algorithm with ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
the tfms are not shared between connections, and there is no
pre-allocated temporary/scratch buffer for calculating the hash, so it
uses GFP_ATOMIC in ah_alloc_tmp()
- net/ipv6/ah6.c is copy'n'paste of the above
- net/ipv4/esp4.c and net/ipv6/esp6.c are more-or-less also copy'n'paste
with crypto_alloc_aead() instead of crypto_alloc_ahash()
- net/mac80211/ is another example of the same pattern; see the
comment in ieee80211_key_alloc() where the keys are allocated, and the
usage in net/mac80211/{rx,tx}.c with bh disabled
- net/xfrm/xfrm_ipcomp.c has its own manager for different compression
algorithms that are used in quite the same fashion. The significant
exception is scratch area: it's IPCOMP_SCRATCH_SIZE=65400. So, if it
could be shared with other crypto users that do the same pattern
(bh-disabled usage), it would save some memory.

And those are just quick-grep examples from net/; looking closer, it may
be possible to find more potential users.
So, crypto_pool.c is 333 lines of a manager that lets users share
pre-allocated ahash requests [comp and aead may be added on top] inside
bh-disabled sections, as well as share a temporary/scratch buffer.
It would make it possible to remove some, if not all, custom managers of
the very same code pattern, some of which don't even try to share
pre-allocated tfms.

That's why I see some value in this crypto-pool thing.
If you NACK it, the alternative for the TCP-AO patches would be to add
yet another pool into net/ipv4/tcp.c that either copies the TCP-MD5 code
or reuses it.

I fail to see how your per-request-key patches would provide an API
alternative to this patch set: users will still have to manage
pre-allocated tfms and buffers.
I can actually see how your per-request keys would benefit *from* this
patch set: it would be much easier to wire per-request keys up to
crypto_pool to avoid per-CPU tfm allocation for the algorithms you add
support for. In that case you won't have to patch crypto_pool users.

[1]:
https://lore.kernel.org/all/[email protected]/T/#u

Thanks, waiting for your input,
Dmitry

2023-01-20 08:55:27

by Herbert Xu

Subject: Re: [PATCH v4 1/4] crypto: Introduce crypto_pool

On Thu, Jan 19, 2023 at 06:03:40PM +0000, Dmitry Safonov wrote:
>
> - net/ipv4/ah4.c could benefit from it: currently it calls
> crypto_alloc_ahash() for every connection, allocating the user-specified
> hash algorithm with ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
> the tfms are not shared between connections, and there is no
> pre-allocated temporary/scratch buffer for calculating the hash, so it
> uses GFP_ATOMIC in ah_alloc_tmp()
> - net/ipv6/ah6.c is copy'n'paste of the above
> - net/ipv4/esp4.c and net/ipv6/esp6.c are more-or-less also copy'n'paste
> with crypto_alloc_aead() instead of crypto_alloc_ahash()

No, they should definitely not switch over to the pool model. In
fact, these provide the correct model that you should follow.

The correct model is to allocate the tfm on the control/slow path,
and allocate requests on the fast path (or reuse existing memory,
e.g., from the skb).
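
Roughly (a sketch, not the literal ah4.c code, which packs the request
and temporary buffers into a single GFP_ATOMIC allocation via
ah_alloc_tmp(); sg, icv and len here are placeholders):

        /* control path (may sleep): */
        tfm = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);

        /* per packet, bh disabled: */
        req = ahash_request_alloc(tfm, GFP_ATOMIC);
        if (!req)
                goto drop;
        ahash_request_set_callback(req, 0, NULL, NULL);
        ahash_request_set_crypt(req, sg, icv, len);
        err = crypto_ahash_digest(req);
        ahash_request_free(req);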

We have not yet explored doing the latter with IPsec but that is
certainly a possibility.

Yes I understand that this is currently impossible for hashes but
that is why I'm working on per-request keys.

> - net/xfrm/xfrm_ipcomp.c has its own manager for different compression
> algorithms that are used in quite the same fashion. The significant
> exception is scratch area: it's IPCOMP_SCRATCH_SIZE=65400. So, if it
> could be shared with other crypto users that do the same pattern
> (bh-disabled usage), it would save some memory.

IPcomp uses the legacy crypto compression interface. We now have
a new acomp interface which was specifically designed so that we
don't need to have these memory pools.
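
For reference, the acomp call pattern looks roughly like this (a sketch;
src_sg/dst_sg and slen/dlen are per-request parameters supplied by the
caller, so there is no shared scratch area to manage):

        acomp = crypto_alloc_acomp("deflate", 0, 0);

        req = acomp_request_alloc(acomp);
        acomp_request_set_callback(req, 0, NULL, NULL);
        acomp_request_set_params(req, src_sg, dst_sg, slen, dlen);
        err = crypto_acomp_decompress(req);
        acomp_request_free(req);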

Cheers,
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

2023-01-20 19:20:55

by Dmitry Safonov

Subject: Re: [PATCH v4 1/4] crypto: Introduce crypto_pool

On 1/20/23 08:49, Herbert Xu wrote:
> On Thu, Jan 19, 2023 at 06:03:40PM +0000, Dmitry Safonov wrote:
>>
>> - net/ipv4/ah4.c could benefit from it: currently it calls
>> crypto_alloc_ahash() for every connection, allocating the user-specified
>> hash algorithm with ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
>> the tfms are not shared between connections, and there is no
>> pre-allocated temporary/scratch buffer for calculating the hash, so it
>> uses GFP_ATOMIC in ah_alloc_tmp()
>> - net/ipv6/ah6.c is copy'n'paste of the above
>> - net/ipv4/esp4.c and net/ipv6/esp6.c are more-or-less also copy'n'paste
>> with crypto_alloc_aead() instead of crypto_alloc_ahash()
>
> No they should definitely not switch over to the pool model. In
> fact, these provide the correct model that you should follow.
>
> The correct model is to allocate the tfm on the control/slow path,
> and allocate requests on the fast path (or reuse existing memory,
> e.g., from the skb).

OK, I see. Do you think it's worth having a pool of tfms?

If not, I can proceed with the TCP-AO patch set and implement a pool of
ahash tfms that will be used only for TCP-MD5 and TCP-AO; does that
sound good to you?

I see that an ahash tfm allocation doesn't eat a lot of memory, a
little more than 100 bytes, but even so, I don't see why not save some
memory "for free", especially when one can have thousands of keys across
different sockets and there isn't much complexity in sharing tfms &
scratch buffers.
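
For a rough sense of scale (back-of-envelope, not measured): at ~100
bytes per tfm, ten thousand per-socket tfms would be on the order of
1 MB, versus a handful of shared per-CPU tfms per algorithm.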

Thanks,
Dmitry