2013-08-09 17:48:57

by Frank Mayhar

Subject: [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl.

The device mapper and some of its modules allocate memory pools at
various points when setting up a device. In some cases, these pools are
fairly large, for example the multipath module allocates a 256-entry
pool and the dm itself allocates three of that size. In a
memory-constrained environment where we're creating a lot of these
devices, the memory use can quickly become significant. Unfortunately,
there's currently no way to change the size of the pools other than by
changing a constant and rebuilding the kernel.

This patch fixes that by changing the hardcoded MIN_IOS (and certain
other) #defines in dm-crypt, dm-io, dm-mpath, dm-snap and dm itself to
sysctl-modifiable values. This lets us change the size of these pools
on the fly so that we can shrink them and reduce memory pressure.

We tested performance of dm-mpath with smaller MIN_IOS sizes for both dm
and dm-mpath, from a value of 32 all the way down to zero. Bearing in
mind that the underlying devices were network-based, we saw essentially
no performance degradation; if there was any, it was down in the noise.
One might wonder why these sizes are the way they are; I investigated
and they've been unchanged since at least 2006.

Signed-off-by: Frank Mayhar <[email protected]>
---
drivers/md/dm-crypt.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++----
drivers/md/dm-io.c | 57 ++++++++++++++++++++++++++++++++++++++++++++--
drivers/md/dm-mpath.c | 48 ++++++++++++++++++++++++++++++++++++++-
drivers/md/dm-snap.c | 45 +++++++++++++++++++++++++++++++++++-
drivers/md/dm.c | 41 ++++++++++++++++++++++++++++++---
5 files changed, 243 insertions(+), 11 deletions(-)
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 6d2d41a..d68f10a 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -178,12 +178,55 @@ struct crypt_config {
#define MIN_IOS 16
#define MIN_POOL_PAGES 32

+static int sysctl_dm_crypt_min_ios = MIN_IOS;
+static int sysctl_dm_crypt_min_pool_pages = MIN_POOL_PAGES;
+
static struct kmem_cache *_crypt_io_pool;

static void clone_init(struct dm_crypt_io *, struct bio *);
static void kcryptd_queue_crypt(struct dm_crypt_io *io);
static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);

+static struct ctl_table_header *dm_crypt_table_header;
+static ctl_table dm_crypt_ctl_table[] = {
+ {
+ .procname = "min_ios",
+ .data = &sysctl_dm_crypt_min_ios,
+ .maxlen = sizeof(int),
+ .mode = S_IRUGO|S_IWUSR,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "min_pool_pages",
+ .data = &sysctl_dm_crypt_min_pool_pages,
+ .maxlen = sizeof(int),
+ .mode = S_IRUGO|S_IWUSR,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+static ctl_table dm_crypt_dir_table[] = {
+ { .procname = "crypt",
+ .mode = 0555,
+ .child = dm_crypt_ctl_table },
+ { }
+};
+
+static ctl_table dm_crypt_dm_dir_table[] = {
+ { .procname = "dm",
+ .mode = 0555,
+ .child = dm_crypt_dir_table },
+ { }
+};
+
+static ctl_table dm_crypt_root_table[] = {
+ { .procname = "dev",
+ .mode = 0555,
+ .child = dm_crypt_dm_dir_table },
+ { }
+};
+
static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
{
return this_cpu_ptr(cc->cpu);
@@ -1571,7 +1614,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;

ret = -ENOMEM;
- cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool);
+ cc->io_pool = mempool_create_slab_pool(sysctl_dm_crypt_min_ios,
+ _crypt_io_pool);
if (!cc->io_pool) {
ti->error = "Cannot allocate crypt io mempool";
goto bad;
@@ -1583,20 +1627,22 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) &
~(crypto_tfm_ctx_alignment() - 1);

- cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
+ cc->req_pool = mempool_create_kmalloc_pool(sysctl_dm_crypt_min_ios,
+ cc->dmreq_start +
sizeof(struct dm_crypt_request) + cc->iv_size);
if (!cc->req_pool) {
ti->error = "Cannot allocate crypt request mempool";
goto bad;
}

- cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
+ cc->page_pool = mempool_create_page_pool(sysctl_dm_crypt_min_pool_pages,
+ 0);
if (!cc->page_pool) {
ti->error = "Cannot allocate page mempool";
goto bad;
}

- cc->bs = bioset_create(MIN_IOS, 0);
+ cc->bs = bioset_create(sysctl_dm_crypt_min_ios, 0);
if (!cc->bs) {
ti->error = "Cannot allocate crypt bioset";
goto bad;
@@ -1851,11 +1897,20 @@ static int __init dm_crypt_init(void)
kmem_cache_destroy(_crypt_io_pool);
}

+ dm_crypt_table_header = register_sysctl_table(dm_crypt_root_table);
+ if (!dm_crypt_table_header) {
+ DMERR("failed to register sysctl");
+ dm_unregister_target(&crypt_target);
+ kmem_cache_destroy(_crypt_io_pool);
+ return -ENOMEM;
+ }
+
return r;
}

static void __exit dm_crypt_exit(void)
{
+ unregister_sysctl_table(dm_crypt_table_header);
dm_unregister_target(&crypt_target);
kmem_cache_destroy(_crypt_io_pool);
}
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index ea49834..8c45c60 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -22,6 +22,9 @@
#define MIN_IOS 16
#define MIN_BIOS 16

+static int sysctl_dm_io_min_ios = MIN_IOS;
+static int sysctl_dm_io_min_bios = MIN_BIOS;
+
struct dm_io_client {
mempool_t *pool;
struct bio_set *bios;
@@ -44,6 +47,46 @@ struct io {

static struct kmem_cache *_dm_io_cache;

+static struct ctl_table_header *dm_io_table_header;
+static ctl_table dm_io_ctl_table[] = {
+ {
+ .procname = "min_ios",
+ .data = &sysctl_dm_io_min_ios,
+ .maxlen = sizeof(int),
+ .mode = S_IRUGO|S_IWUSR,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "min_bios",
+ .data = &sysctl_dm_io_min_bios,
+ .maxlen = sizeof(int),
+ .mode = S_IRUGO|S_IWUSR,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+static ctl_table dm_io_dir_table[] = {
+ { .procname = "io",
+ .mode = 0555,
+ .child = dm_io_ctl_table },
+ { }
+};
+
+static ctl_table dm_io_dm_dir_table[] = {
+ { .procname = "dm",
+ .mode = 0555,
+ .child = dm_io_dir_table },
+ { }
+};
+
+static ctl_table dm_io_root_table[] = {
+ { .procname = "dev",
+ .mode = 0555,
+ .child = dm_io_dm_dir_table },
+ { }
+};
+
/*
* Create a client with mempool and bioset.
*/
@@ -55,11 +98,12 @@ struct dm_io_client *dm_io_client_create(void)
if (!client)
return ERR_PTR(-ENOMEM);

- client->pool = mempool_create_slab_pool(MIN_IOS, _dm_io_cache);
+ client->pool = mempool_create_slab_pool(sysctl_dm_io_min_ios,
+ _dm_io_cache);
if (!client->pool)
goto bad;

- client->bios = bioset_create(MIN_BIOS, 0);
+ client->bios = bioset_create(sysctl_dm_io_min_bios, 0);
if (!client->bios)
goto bad;

@@ -515,11 +559,20 @@ int __init dm_io_init(void)
if (!_dm_io_cache)
return -ENOMEM;

+ dm_io_table_header = register_sysctl_table(dm_io_root_table);
+ if (!dm_io_table_header) {
+ DMERR("failed to register sysctl");
+ kmem_cache_destroy(_dm_io_cache);
+ _dm_io_cache = NULL;
+ return -ENOMEM;
+ }
+
return 0;
}

void dm_io_exit(void)
{
+ unregister_sysctl_table(dm_io_table_header);
kmem_cache_destroy(_dm_io_cache);
_dm_io_cache = NULL;
}
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 5adede1..099af1b 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -118,6 +118,8 @@ typedef int (*action_fn) (struct pgpath *pgpath);

#define MIN_IOS 256 /* Mempool size */

+static int sysctl_dm_mpath_min_ios = MIN_IOS;
+
static struct kmem_cache *_mpio_cache;

static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
@@ -125,6 +127,38 @@ static void process_queued_ios(struct work_struct *work);
static void trigger_event(struct work_struct *work);
static void activate_path(struct work_struct *work);

+static struct ctl_table_header *dm_mpath_table_header;
+static ctl_table dm_mpath_ctl_table[] = {
+ {
+ .procname = "min_ios",
+ .data = &sysctl_dm_mpath_min_ios,
+ .maxlen = sizeof(int),
+ .mode = S_IRUGO|S_IWUSR,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+static ctl_table dm_mpath_dir_table[] = {
+ { .procname = "mpath",
+ .mode = 0555,
+ .child = dm_mpath_ctl_table },
+ { }
+};
+
+static ctl_table dm_mpath_dm_dir_table[] = {
+ { .procname = "dm",
+ .mode = 0555,
+ .child = dm_mpath_dir_table },
+ { }
+};
+
+static ctl_table dm_mpath_root_table[] = {
+ { .procname = "dev",
+ .mode = 0555,
+ .child = dm_mpath_dm_dir_table },
+ { }
+};

/*-----------------------------------------------
* Allocation routines
@@ -202,7 +236,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
INIT_WORK(&m->trigger_event, trigger_event);
init_waitqueue_head(&m->pg_init_wait);
mutex_init(&m->work_mutex);
- m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
+ m->mpio_pool = mempool_create_slab_pool(sysctl_dm_mpath_min_ios,
+ _mpio_cache);
if (!m->mpio_pool) {
kfree(m);
return NULL;
@@ -1745,6 +1780,15 @@ static int __init dm_multipath_init(void)
kmem_cache_destroy(_mpio_cache);
return -ENOMEM;
}
+ dm_mpath_table_header = register_sysctl_table(dm_mpath_root_table);
+ if (!dm_mpath_table_header) {
+ DMERR("failed to register sysctl");
+ destroy_workqueue(kmpath_handlerd);
+ destroy_workqueue(kmultipathd);
+ dm_unregister_target(&multipath_target);
+ kmem_cache_destroy(_mpio_cache);
+ return -ENOMEM;
+ }

DMINFO("version %u.%u.%u loaded",
multipath_target.version[0], multipath_target.version[1],
@@ -1755,6 +1799,8 @@ static int __init dm_multipath_init(void)

static void __exit dm_multipath_exit(void)
{
+ unregister_sysctl_table(dm_mpath_table_header);
+
destroy_workqueue(kmpath_handlerd);
destroy_workqueue(kmultipathd);

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index c434e5a..723b3c2 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -33,11 +33,45 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
* The size of the mempool used to track chunks in use.
*/
#define MIN_IOS 256
+static int sysctl_dm_snap_min_ios = MIN_IOS;

#define DM_TRACKED_CHUNK_HASH_SIZE 16
#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \
(DM_TRACKED_CHUNK_HASH_SIZE - 1))

+static struct ctl_table_header *dm_snap_table_header;
+static ctl_table dm_snap_ctl_table[] = {
+ {
+ .procname = "min_ios",
+ .data = &sysctl_dm_snap_min_ios,
+ .maxlen = sizeof(int),
+ .mode = S_IRUGO|S_IWUSR,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+static ctl_table dm_snap_dir_table[] = {
+ { .procname = "snap",
+ .mode = 0555,
+ .child = dm_snap_ctl_table },
+ { }
+};
+
+static ctl_table dm_snap_dm_dir_table[] = {
+ { .procname = "dm",
+ .mode = 0555,
+ .child = dm_snap_dir_table },
+ { }
+};
+
+static ctl_table dm_snap_root_table[] = {
+ { .procname = "dev",
+ .mode = 0555,
+ .child = dm_snap_dm_dir_table },
+ { }
+};
+
struct dm_exception_table {
uint32_t hash_mask;
unsigned hash_shift;
@@ -1118,7 +1152,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad_kcopyd;
}

- s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache);
+ s->pending_pool = mempool_create_slab_pool(sysctl_dm_snap_min_ios,
+ pending_cache);
if (!s->pending_pool) {
ti->error = "Could not allocate mempool for pending exceptions";
r = -ENOMEM;
@@ -2268,8 +2303,14 @@ static int __init dm_snapshot_init(void)
goto bad_pending_cache;
}

+ dm_snap_table_header = register_sysctl_table(dm_snap_root_table);
+ if (!dm_snap_table_header)
+ goto bad_sysctl_table;
+
return 0;

+bad_sysctl_table:
+ kmem_cache_destroy(pending_cache);
bad_pending_cache:
kmem_cache_destroy(exception_cache);
bad_exception_cache:
@@ -2288,6 +2329,8 @@ bad_register_snapshot_target:

static void __exit dm_snapshot_exit(void)
{
+ unregister_sysctl_table(dm_snap_table_header);
+
dm_unregister_target(&snapshot_target);
dm_unregister_target(&origin_target);
dm_unregister_target(&merge_target);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 9e39d2b..fdff32f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -19,6 +19,7 @@
#include <linux/idr.h>
#include <linux/hdreg.h>
#include <linux/delay.h>
+#include <linux/sysctl.h>

#include <trace/events/block.h>

@@ -209,9 +210,36 @@ struct dm_md_mempools {
};

#define MIN_IOS 256
+static int sysctl_dm_min_ios = MIN_IOS;
static struct kmem_cache *_io_cache;
static struct kmem_cache *_rq_tio_cache;

+static struct ctl_table_header *dm_table_header;
+static ctl_table dm_ctl_table[] = {
+ {
+ .procname = "min_ios",
+ .data = &sysctl_dm_min_ios,
+ .maxlen = sizeof(int),
+ .mode = S_IRUGO|S_IWUSR,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+static ctl_table dm_dir_table[] = {
+ { .procname = "dm",
+ .mode = 0555,
+ .child = dm_ctl_table },
+ { }
+};
+
+static ctl_table dm_root_table[] = {
+ { .procname = "dev",
+ .mode = 0555,
+ .child = dm_dir_table },
+ { }
+};
+
static int __init local_init(void)
{
int r = -ENOMEM;
@@ -229,16 +257,22 @@ static int __init local_init(void)
if (r)
goto out_free_rq_tio_cache;

+ dm_table_header = register_sysctl_table(dm_root_table);
+ if (!dm_table_header)
+ goto out_uevent_exit;
+
_major = major;
r = register_blkdev(_major, _name);
if (r < 0)
- goto out_uevent_exit;
+ goto out_unregister_sysctl_table;

if (!_major)
_major = r;

return 0;

+out_unregister_sysctl_table:
+ unregister_sysctl_table(dm_table_header);
out_uevent_exit:
dm_uevent_exit();
out_free_rq_tio_cache:
@@ -251,6 +285,7 @@ out_free_io_cache:

static void local_exit(void)
{
+ unregister_sysctl_table(dm_table_header);
kmem_cache_destroy(_rq_tio_cache);
kmem_cache_destroy(_io_cache);
unregister_blkdev(_major, _name);
@@ -2806,14 +2841,14 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
} else if (type == DM_TYPE_REQUEST_BASED) {
cachep = _rq_tio_cache;
- pool_size = MIN_IOS;
+ pool_size = sysctl_dm_min_ios;
front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
/* per_bio_data_size is not used. See __bind_mempools(). */
WARN_ON(per_bio_data_size != 0);
} else
goto out;

- pools->io_pool = mempool_create_slab_pool(MIN_IOS, cachep);
+ pools->io_pool = mempool_create_slab_pool(sysctl_dm_min_ios, cachep);
if (!pools->io_pool)
goto out;


--
Frank Mayhar
310-460-4042
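
(For reference: the four-level ctl_table nesting in this patch registers the
tunables at the paths below; defaults are taken from the #defines. Note that
the pools are sized when a device or table is created, so a change only
affects devices set up afterwards.)

/proc/sys/dev/dm/min_ios               (dm.c,       default 256)
/proc/sys/dev/dm/crypt/min_ios         (dm-crypt.c, default 16)
/proc/sys/dev/dm/crypt/min_pool_pages  (dm-crypt.c, default 32)
/proc/sys/dev/dm/io/min_ios            (dm-io.c,    default 16)
/proc/sys/dev/dm/io/min_bios           (dm-io.c,    default 16)
/proc/sys/dev/dm/mpath/min_ios         (dm-mpath.c, default 256)
/proc/sys/dev/dm/snap/min_ios          (dm-snap.c,  default 256)

# e.g., shrink the multipath reserve before creating devices:
echo 32 > /proc/sys/dev/dm/mpath/min_ios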


2013-08-13 16:42:00

by Frank Mayhar

Subject: Re: [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl.

Ping? Has anyone glanced at this?

On Fri, 2013-08-09 at 10:48 -0700, Frank Mayhar wrote:
> The device mapper and some of its modules allocate memory pools at
> various points when setting up a device. In some cases, these pools are
> fairly large, for example the multipath module allocates a 256-entry
> pool and the dm itself allocates three of that size. In a
> memory-constrained environment where we're creating a lot of these
> devices, the memory use can quickly become significant. Unfortunately,
> there's currently no way to change the size of the pools other than by
> changing a constant and rebuilding the kernel.
>
> This patch fixes that by changing the hardcoded MIN_IOS (and certain
> other) #defines in dm-crypt, dm-io, dm-mpath, dm-snap and dm itself to
> sysctl-modifiable values. This lets us change the size of these pools
> on the fly so that we can shrink them and reduce memory pressure.
>
> We tested performance of dm-mpath with smaller MIN_IOS sizes for both dm
> and dm-mpath, from a value of 32 all the way down to zero. Bearing in
> mind that the underlying devices were network-based, we saw essentially
> no performance degradation; if there was any, it was down in the noise.
> One might wonder why these sizes are the way they are; I investigated
> and they've been unchanged since at least 2006.
>
> Signed-off-by: Frank Mayhar <[email protected]>


--
Frank Mayhar
310-460-4042

2013-08-17 00:01:11

by Frank Mayhar

Subject: Re: [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl.

Sorry for the repeats; mailer issues (this was supposed to go to dm-devel).
--
Frank Mayhar
310-460-4042

2013-08-17 12:30:44

by Alasdair G Kergon

Subject: Re: [dm-devel] [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl.

On Fri, Aug 16, 2013 at 03:55:21PM -0700, Frank Mayhar wrote:
> This patch fixes that by changing the hardcoded MIN_IOS (and certain
> other) #defines in dm-crypt, dm-io, dm-mpath, dm-snap and dm itself to
> sysctl-modifiable values. This lets us change the size of these pools
> on the fly so that we can shrink them and reduce memory pressure.

Did you consider using module_param_named()?
(E.g. dm_verity_prefetch_cluster in dm-verity.)

Alasdair
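
(For context, the dm-verity example above amounts to a one-line declaration
instead of the four-level ctl_table nesting in the patch. A minimal sketch of
that approach applied to dm-mpath, with illustrative names; the parameter
would then appear as /sys/module/dm_multipath/parameters/min_ios and could
also be set at load time, e.g. dm_multipath.min_ios=32:)

/* Sketch only: replaces the ctl_table boilerplate in the patch. */
static unsigned dm_mpath_min_ios = MIN_IOS;
module_param_named(min_ios, dm_mpath_min_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(min_ios, "Reserved entries in the mpio mempool");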

2013-08-19 13:40:43

by Mike Snitzer

Subject: Re: dm: Make MIN_IOS, et al, tunable via sysctl.

On Sat, Aug 17 2013 at 8:30am -0400,
Alasdair G Kergon <[email protected]> wrote:

> On Fri, Aug 16, 2013 at 03:55:21PM -0700, Frank Mayhar wrote:
> > This patch fixes that by changing the hardcoded MIN_IOS (and certain
> > other) #defines in dm-crypt, dm-io, dm-mpath, dm-snap and dm itself to
> > sysctl-modifiable values. This lets us change the size of these pools
> > on the fly so that we can shrink them and reduce memory pressure.
>
> Did you consider using module_param_named()?
> (E.g. dm_verity_prefetch_cluster in dm-verity.)

Right, I'd prefer to see this implemented in terms of
module_param_named(). Aside from dm-verity, dm-bufio.c makes the most
use of it.

Mike

2013-08-19 14:10:45

by Mike Snitzer

Subject: Re: dm: Make MIN_IOS, et al, tunable via sysctl.

On Fri, Aug 16 2013 at 6:55pm -0400,
Frank Mayhar <[email protected]> wrote:

> The device mapper and some of its modules allocate memory pools at
> various points when setting up a device. In some cases, these pools are
> fairly large, for example the multipath module allocates a 256-entry
> pool and the dm itself allocates three of that size. In a
> memory-constrained environment where we're creating a lot of these
> devices, the memory use can quickly become significant. Unfortunately,
> there's currently no way to change the size of the pools other than by
> changing a constant and rebuilding the kernel.
>
> This patch fixes that by changing the hardcoded MIN_IOS (and certain
> other) #defines in dm-crypt, dm-io, dm-mpath, dm-snap and dm itself to
> sysctl-modifiable values. This lets us change the size of these pools
> on the fly so that we can shrink them and reduce memory pressure.

These memory reserves are a long-standing issue with DM (made worse when
request-based mpath was introduced). Two years ago, I assembled a patch
series that took one approach to trying to fix it:
http://people.redhat.com/msnitzer/patches/upstream/dm-rq-based-mempool-sharing/series.html

But in the end I wasn't convinced sharing the memory reserve would allow
for 100s of mpath devices to make forward progress if memory is
depleted.

All said, I think adding the ability to control the size of the memory
reserves is reasonable. It allows informed admins to establish
lower reserves (based on the awareness that rq-based mpath doesn't need
to support really large IOs, etc) without compromising the ability to
make forward progress.

But, as mentioned in my previous mail, I'd like to see this implemented
in terms of module_param_named().

> We tested performance of dm-mpath with smaller MIN_IOS sizes for both dm
> and dm-mpath, from a value of 32 all the way down to zero.

Bio-based can safely be reduced, as this older (uncommitted) patch did:
http://people.redhat.com/msnitzer/patches/upstream/dm-rq-based-mempool-sharing/0000-dm-lower-bio-based-reservation.patch

> Bearing in mind that the underlying devices were network-based, we saw
> essentially no performance degradation; if there was any, it was down
> in the noise. One might wonder why these sizes are the way they are;
> I investigated and they've been unchanged since at least 2006.

Performance isn't the concern. The concern is: does DM allow for
forward progress if the system's memory is completely exhausted?

This is why request-based has such an extensive reserve, because it
needs to account for cloning the largest possible request that comes in
(with multiple bios).
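
(The forward-progress guarantee referred to here is the mempool contract; a
minimal sketch, with a hypothetical helper, of why the reserve size rather
than throughput is what matters:)

/* Hypothetical helper illustrating the mempool contract. */
static struct dm_io *alloc_io(mempool_t *pool)
{
	/*
	 * With a blocking mask such as GFP_NOIO, mempool_alloc() does not
	 * fail: if the slab allocation cannot be satisfied under memory
	 * pressure it falls back to the preallocated reserve, and if the
	 * reserve is empty the caller sleeps until an in-flight element
	 * is returned via mempool_free().  Forward progress therefore
	 * only requires that elements eventually come back, which is why
	 * a small reserve can suffice for bio-based DM while
	 * request-based DM must reserve enough to clone the largest
	 * request it may see.
	 */
	return mempool_alloc(pool, GFP_NOIO);
}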

2013-08-19 15:04:16

by Frank Mayhar

Subject: Re: dm: Make MIN_IOS, et al, tunable via sysctl.

On Mon, 2013-08-19 at 09:40 -0400, Mike Snitzer wrote:
> On Sat, Aug 17 2013 at 8:30am -0400,
> Alasdair G Kergon <[email protected]> wrote:
>
> > On Fri, Aug 16, 2013 at 03:55:21PM -0700, Frank Mayhar wrote:
> > > This patch fixes that by changing the hardcoded MIN_IOS (and certain
> > > other) #defines in dm-crypt, dm-io, dm-mpath, dm-snap and dm itself to
> > > sysctl-modifiable values. This lets us change the size of these pools
> > > on the fly so that we can shrink them and reduce memory pressure.
> >
> > Did you consider using module_param_named()?
> > (E.g. dm_verity_prefetch_cluster in dm-verity.)
>
> Right, I'd prefer to see this implemented in terms of
> module_param_named(). Aside from dm-verity, dm-bufio.c makes the most
> use of it.

Yeah, I can do that.
--
Frank Mayhar
310-460-4042

2013-08-19 17:54:11

by Frank Mayhar

Subject: Re: [dm-devel] dm: Make MIN_IOS, et al, tunable via sysctl.

On Mon, 2013-08-19 at 10:00 -0400, Mike Snitzer wrote:
> Performance isn't the concern. The concern is: does DM allow for
> forward progress if the system's memory is completely exhausted?
>
> This is why request-based has such an extensive reserve, because it
> needs to account for cloning the largest possible request that comes in
> (with multiple bios).

Thanks for the response. In our particular case, I/O will be file
system based and over a network, which makes it pretty easy for us to be
sure that large I/Os never happen. That notwithstanding, however, as
you said it just seems reasonable to make these values configurable.

I'm also looking at making some similar constants in dm-verity and
dm-bufio configurable in the same way and for similar reasons.
--
Frank Mayhar
310-460-4042

2013-08-19 18:15:30

by Mike Snitzer

Subject: Re: dm: Make MIN_IOS, et al, tunable via sysctl.

On Mon, Aug 19 2013 at 1:54pm -0400,
Frank Mayhar <[email protected]> wrote:

> On Mon, 2013-08-19 at 10:00 -0400, Mike Snitzer wrote:
> > Performance isn't the concern. The concern is: does DM allow for
> > forward progress if the system's memory is completely exhausted?
> >
> > This is why request-based has such an extensive reserve, because it
> > needs to account for cloning the largest possible request that comes in
> > (with multiple bios).
>
> Thanks for the response. In our particular case, I/O will be file
> system based and over a network, which makes it pretty easy for us to be
> sure that large I/Os never happen. That notwithstanding, however, as
> you said it just seems reasonable to make these values configurable.
>
> I'm also looking at making some similar constants in dm-verity and
> dm-bufio configurable in the same way and for similar reasons.

OK, it would be helpful if you were to split each patch out, e.g. separate
patches for DM core, verity, bufio, etc. Reserve the background context
for the 0th patch header (or the DM core patch), with more precise patch
headers that document the new tunable exposed by each patch.

It would also be nice to see these tunables documented in the
appropriate Documentation/device-mapper/ file.

Thanks for working on this. I'll have time to review/assist these
changes in the near term.

2013-08-20 21:22:28

by Mikulas Patocka

Subject: Re: [dm-devel] [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl.



On Fri, 16 Aug 2013, Frank Mayhar wrote:

> The device mapper and some of its modules allocate memory pools at
> various points when setting up a device. In some cases, these pools are
> fairly large, for example the multipath module allocates a 256-entry
> pool and the dm itself allocates three of that size. In a
> memory-constrained environment where we're creating a lot of these
> devices, the memory use can quickly become significant. Unfortunately,
> there's currently no way to change the size of the pools other than by
> changing a constant and rebuilding the kernel.
>
> This patch fixes that by changing the hardcoded MIN_IOS (and certain
> other) #defines in dm-crypt, dm-io, dm-mpath, dm-snap and dm itself to
> sysctl-modifiable values. This lets us change the size of these pools
> on the fly so that we can shrink them and reduce memory pressure.
>
> We tested performance of dm-mpath with smaller MIN_IOS sizes for both dm
> and dm-mpath, from a value of 32 all the way down to zero. Bearing in
> mind that the underlying devices were network-based, we saw essentially
> no performance degradation; if there was any, it was down in the noise.
> One might wonder why these sizes are the way they are; I investigated
> and they've been unchanged since at least 2006.

I think it would be better to set the values to some low value (1 should
be enough; the 16 used in some places is already low enough). There
is no need to make it user-configurable, because the user will see no
additional benefit from bigger mempools.

Maybe multipath is the exception - but other dm targets don't really need
big mempools so there is no need to make the size configurable.

Mikulas

> [patch snipped]

2013-08-20 21:28:44

by Frank Mayhar

Subject: Re: [dm-devel] [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl.

On Tue, 2013-08-20 at 17:22 -0400, Mikulas Patocka wrote:
> On Fri, 16 Aug 2013, Frank Mayhar wrote:
> > The device mapper and some of its modules allocate memory pools at
> > various points when setting up a device. In some cases, these pools are
> > fairly large, for example the multipath module allocates a 256-entry
> > pool and the dm itself allocates three of that size. In a
> > memory-constrained environment where we're creating a lot of these
> > devices, the memory use can quickly become significant. Unfortunately,
> > there's currently no way to change the size of the pools other than by
> > changing a constant and rebuilding the kernel.
> I think it would be better to set the values to some low value (1 should
> be enough; the 16 used in some places is already low enough). There
> is no need to make it user-configurable, because the user will see no
> additional benefit from bigger mempools.
>
> Maybe multipath is the exception - but other dm targets don't really need
> big mempools so there is no need to make the size configurable.

The point is not to make the mempools bigger, the point is to be able to
make them _smaller_ for memory-constrained environments. In some cases,
even 16 can be too big, especially when creating a large number of
devices.

In any event, it seems unfortunate to me that these values are
hard-coded. One shouldn't have to rebuild the kernel to change them,
IMHO.
--
Frank Mayhar
310-460-4042

2013-08-20 21:41:43

by Mikulas Patocka

Subject: Re: [dm-devel] dm: Make MIN_IOS, et al, tunable via sysctl.



On Mon, 19 Aug 2013, Mike Snitzer wrote:

> On Fri, Aug 16 2013 at 6:55pm -0400,
> Frank Mayhar <[email protected]> wrote:
>
> > The device mapper and some of its modules allocate memory pools at
> > various points when setting up a device. In some cases, these pools are
> > fairly large, for example the multipath module allocates a 256-entry
> > pool and the dm itself allocates three of that size. In a
> > memory-constrained environment where we're creating a lot of these
> > devices, the memory use can quickly become significant. Unfortunately,
> > there's currently no way to change the size of the pools other than by
> > changing a constant and rebuilding the kernel.
> >
> > This patch fixes that by changing the hardcoded MIN_IOS (and certain
> > other) #defines in dm-crypt, dm-io, dm-mpath, dm-snap and dm itself to
> > sysctl-modifiable values. This lets us change the size of these pools
> > on the fly so that we can shrink them and reduce memory pressure.
>
> These memory reserves are a long-standing issue with DM (made worse when
> request-based mpath was introduced). Two years ago, I assembled a patch
> series that took one approach to trying to fix it:
> http://people.redhat.com/msnitzer/patches/upstream/dm-rq-based-mempool-sharing/series.html
>
> But in the end I wasn't convinced sharing the memory reserve would allow
> for 100s of mpath devices to make forward progress if memory is
> depleted.
>
> All said, I think adding the ability to control the size of the memory
> reserves is reasonable. It allows informed admins to establish
> lower reserves (based on the awareness that rq-based mpath doesn't need
> to support really large IOs, etc) without compromising the ability to
> make forward progress.
>
> But, as mentioned in my previous mail, I'd like to see this implemented
> in terms of module_param_named().
>
> > We tested performance of dm-mpath with smaller MIN_IOS sizes for both dm
> > and dm-mpath, from a value of 32 all the way down to zero.
>
> Bio-based can safely be reduced, as this older (uncommitted) patch did:
> http://people.redhat.com/msnitzer/patches/upstream/dm-rq-based-mempool-sharing/0000-dm-lower-bio-based-reservation.patch
>
> > Bearing in mind that the underlying devices were network-based, we saw
> > essentially no performance degradation; if there was any, it was down
> > in the noise. One might wonder why these sizes are the way they are;
> > I investigated and they've been unchanged since at least 2006.
>
> Performance isn't the concern. The concern is: does DM allow for
> forward progress if the system's memory is completely exhausted?

There is one possible deadlock that was introduced in commit
d89d87965dcbe6fe4f96a2a7e8421b3a75f634d1 in 2.6.22-rc1. Unfortunately, no
one found the bug at the time, and now it seems hard to revert it.

The problem is this:

* we send bio1 to the device dm-1; device mapper splits it into bio2 and
bio3 and sends both of them to the device dm-2. These two bios are added
to current->bio_list.

* bio2 is popped off current->bio_list, a mempool entry from device dm-2's
mempool is allocated, bio4 is created and sent to the device dm-3. bio4 is
added to the end of current->bio_list.

* bio3 is popped off current->bio_list, a mempool entry from device dm-2's
mempool is allocated. Suppose that the mempool is exhausted, so we wait
until some existing work (bio2) finishes and returns the entry to the
mempool.

So: bio3's request routine waits until bio2 finishes and refills the
mempool. bio2 is waiting for bio4 to finish. bio4 is in current->bio_list
and is waiting until bio3's request routine finishes. Deadlock.

In practice, it is not so serious because in mempool_alloc there is:
/*
* FIXME: this should be io_schedule(). The timeout is there as a
* workaround for some DM problems in 2.6.18.
*/
io_schedule_timeout(5*HZ);

- so it waits for 5 seconds and retries. If there is something in the
system that is able to free memory, it resumes.

> This is why request-based has such an extensive reserve, because it
> needs to account for cloning the largest possible request that comes in
> (with multiple bios).

Mikulas

2013-08-20 21:44:43

by Mikulas Patocka

Subject: Re: [dm-devel] dm: Make MIN_IOS, et al, tunable via sysctl.



On Mon, 19 Aug 2013, Frank Mayhar wrote:

> On Mon, 2013-08-19 at 10:00 -0400, Mike Snitzer wrote:
> > Performance isn't the concern. The concern is: does DM allow for
> > forward progress if the system's memory is completely exhausted?
> >
> > This is why request-based has such an extensive reserve, because it
> > needs to account for cloning the largest possible request that comes in
> > (with multiple bios).
>
> Thanks for the response. In our particular case, I/O will be file
> system based and over a network, which makes it pretty easy for us to be
> sure that large I/Os never happen. That notwithstanding, however, as
> you said it just seems reasonable to make these values configurable.
>
> I'm also looking at making some similar constants in dm-verity and
> dm-bufio configurable in the same way and for similar reasons.

Regarding dm-bufio: the user of dm-bufio sets the pool size as an argument
in dm_bufio_client_create. There is no need to make it configurable: if
the user selects too low a value, deadlock is possible; if the user selects
too high a value, there is no additional advantage.

Regarding dm-verity: the mempool size is 4; there is no need to make it
bigger and no advantage in doing so.

Mikulas

2013-08-20 21:47:42

by Mikulas Patocka

Subject: Re: [dm-devel] [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl.



On Tue, 20 Aug 2013, Frank Mayhar wrote:

> On Tue, 2013-08-20 at 17:22 -0400, Mikulas Patocka wrote:
> > On Fri, 16 Aug 2013, Frank Mayhar wrote:
> > > The device mapper and some of its modules allocate memory pools at
> > > various points when setting up a device. In some cases, these pools are
> > > fairly large, for example the multipath module allocates a 256-entry
> > > pool and the dm itself allocates three of that size. In a
> > > memory-constrained environment where we're creating a lot of these
> > > devices, the memory use can quickly become significant. Unfortunately,
> > > there's currently no way to change the size of the pools other than by
> > > changing a constant and rebuilding the kernel.
> > I think it would be better to set the values to some low value (1 should
> > be enough; the 16 used in some places is already low enough). There
> > is no need to make it user-configurable, because the user will see no
> > additional benefit from bigger mempools.
> >
> > Maybe multipath is the exception - but other dm targets don't really need
> > big mempools so there is no need to make the size configurable.
>
> The point is not to make the mempools bigger, the point is to be able to
> make them _smaller_ for memory-constrained environments. In some cases,
> even 16 can be too big, especially when creating a large number of
> devices.
>
> In any event, it seems unfortunate to me that these values are
> hard-coded. One shouldn't have to rebuild the kernel to change them,
> IMHO.

So make a patch that changes 16 to 1 if it helps on your system (I'd like
to ask - what is the configuration where this kind of change helps?).
There is no need for that to be tunable; anyone can live with a mempool
size of 1.

Mikulas

2013-08-20 21:52:49

by Frank Mayhar

Subject: Re: [dm-devel] dm: Make MIN_IOS, et al, tunable via sysctl.

On Tue, 2013-08-20 at 17:44 -0400, Mikulas Patocka wrote:
> On Mon, 19 Aug 2013, Frank Mayhar wrote:
> > On Mon, 2013-08-19 at 10:00 -0400, Mike Snitzer wrote:
> > > Performance isn't the concern. The concern is: does DM allow for
> > > forward progress if the system's memory is completely exhausted?
> > >
> > > This is why request-based has such an extensive reserve, because it
> > > needs to account for cloning the largest possible request that comes in
> > > (with multiple bios).
> >
> > Thanks for the response. In our particular case, I/O will be file
> > system based and over a network, which makes it pretty easy for us to be
> > sure that large I/Os never happen. That notwithstanding, however, as
> > you said it just seems reasonable to make these values configurable.
> >
> > I'm also looking at making some similar constants in dm-verity and
> > dm-bufio configurable in the same way and for similar reasons.
>
> Regarding dm-bufio: the user of dm-bufio sets the pool size as an argument
> in dm_bufio_client_create. There is no need to make it configurable: if
> the user selects too low a value, deadlock is possible; if the user selects
> too high a value, there is no additional advantage.

True, but dm-bufio also allocates a fixed-size 8MB (on a 64-bit
machine) hash table. I'm still getting performance data but it appears
that reducing this, even by a lot, doesn't impact performance
significantly, at least not with the workload I'm running. (Which is
using fio, random and sequential reads of varying buffer sizes.)

> Regarding dm-verity: the mempool size is 4; there is no need to make it
> bigger and no advantage in doing so.

Also true, but there may be an advantage in making it smaller.
--
Frank Mayhar
310-460-4042

2013-08-20 21:57:16

by Frank Mayhar

Subject: Re: [dm-devel] [PATCH] dm: Make MIN_IOS, et al, tunable via sysctl.

On Tue, 2013-08-20 at 17:47 -0400, Mikulas Patocka wrote:
>
> On Tue, 20 Aug 2013, Frank Mayhar wrote:
>
> > On Tue, 2013-08-20 at 17:22 -0400, Mikulas Patocka wrote:
> > > On Fri, 16 Aug 2013, Frank Mayhar wrote:
> > > > The device mapper and some of its modules allocate memory pools at
> > > > various points when setting up a device. In some cases, these pools are
> > > > fairly large, for example the multipath module allocates a 256-entry
> > > > pool and the dm itself allocates three of that size. In a
> > > > memory-constrained environment where we're creating a lot of these
> > > > devices, the memory use can quickly become significant. Unfortunately,
> > > > there's currently no way to change the size of the pools other than by
> > > > changing a constant and rebuilding the kernel.
> > > I think it would be better to set the values to some low value (1 should
> > > be enough; the 16 used in some places is already low enough). There
> > > is no need to make it user-configurable, because the user will see no
> > > additional benefit from bigger mempools.
> > >
> > > Maybe multipath is the exception - but other dm targets don't really need
> > > big mempools so there is no need to make the size configurable.
> >
> > The point is not to make the mempools bigger, the point is to be able to
> > make them _smaller_ for memory-constrained environments. In some cases,
> > even 16 can be too big, especially when creating a large number of
> > devices.
> >
> > In any event, it seems unfortunate to me that these values are
> > hard-coded. One shouldn't have to rebuild the kernel to change them,
> > IMHO.
>
> So make a patch that changes 16 to 1 if it helps on your system (I'd like
> to ask - what is the configuration where this kind of change helps?).
> There is no need for that to be tunable, anyone can live with mempool size
> being 1.

I reiterate: It seems unfortunate that these values are hard-coded. It
seems to me that a sysadmin should be able to tune them according to his
or her needs. Particularly when the same kernel is being run on many
machines that may have different constraints; building a special kernel
for each set of constraints doesn't scale.

And as I said, it's a memory-constrained environment.
--
Frank Mayhar
310-460-4042

2013-08-20 22:24:43

by Mike Snitzer

Subject: Re: dm: Make MIN_IOS, et al, tunable via sysctl.

On Tue, Aug 20 2013 at 5:57pm -0400,
Frank Mayhar <[email protected]> wrote:

> On Tue, 2013-08-20 at 17:47 -0400, Mikulas Patocka wrote:
> >
> > On Tue, 20 Aug 2013, Frank Mayhar wrote:
> >
> > > On Tue, 2013-08-20 at 17:22 -0400, Mikulas Patocka wrote:
> > > > On Fri, 16 Aug 2013, Frank Mayhar wrote:
> > > > > The device mapper and some of its modules allocate memory pools at
> > > > > various points when setting up a device. In some cases, these pools are
> > > > > fairly large, for example the multipath module allocates a 256-entry
> > > > > pool and the dm itself allocates three of that size. In a
> > > > > memory-constrained environment where we're creating a lot of these
> > > > > devices, the memory use can quickly become significant. Unfortunately,
> > > > > there's currently no way to change the size of the pools other than by
> > > > > changing a constant and rebuilding the kernel.
> > > > I think it would be better to set the values to some low value (1 should
> > > > be enough; the 16 used in some places is already low enough). There
> > > > is no need to make it user-configurable, because the user will see no
> > > > additional benefit from bigger mempools.
> > > >
> > > > Maybe multipath is the exception - but other dm targets don't really need
> > > > big mempools so there is no need to make the size configurable.
> > >
> > > The point is not to make the mempools bigger, the point is to be able to
> > > make them _smaller_ for memory-constrained environments. In some cases,
> > > even 16 can be too big, especially when creating a large number of
> > > devices.
> > >
> > > In any event, it seems unfortunate to me that these values are
> > > hard-coded. One shouldn't have to rebuild the kernel to change them,
> > > IMHO.
> >
> > So make a patch that changes 16 to 1 if it helps on your system (I'd like
> > to ask - what is the configuration where this kind of change helps?).
> > There is no need for that to be tunable; anyone can live with a mempool
> > size of 1.
>
> I reiterate: It seems unfortunate that these values are hard-coded. It
> seems to me that a sysadmin should be able to tune them according to his
> or her needs. Particularly when the same kernel is being run on many
> machines that may have different constraints; building a special kernel
> for each set of constraints doesn't scale.
>
> And as I said, it's a memory-constrained environment.

Mikulas' point is that you cannot reduce the size to smaller than 1.
And aside from rq-based DM, 1 is sufficient to allow for forward
progress even when memory is completely consumed.

A patch that simply changes them to 1 but makes the rq-based DM
mempool's size configurable should actually be fine.

Mike
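
(A sketch of that shape against dm.c, using Alasdair's module_param
suggestion rather than sysctl; names are illustrative, not from a committed
patch:)

/* Bio-based devices keep a small fixed reserve; only the request-based
 * reserve is exposed, as /sys/module/dm_mod/parameters/reserved_rq_based_ios.
 */
#define RESERVED_BIO_BASED_IOS		16
#define RESERVED_REQUEST_BASED_IOS	256

static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");

/* dm_alloc_md_mempools() would then pick
 *   pool_size = (type == DM_TYPE_BIO_BASED) ? RESERVED_BIO_BASED_IOS
 *                                           : max(1U, reserved_rq_based_ios);
 * clamping so that a runtime write of 0 still leaves a workable reserve. */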

2013-08-20 22:53:01

by Mikulas Patocka

Subject: Re: dm: Make MIN_IOS, et al, tunable via sysctl.



On Tue, 20 Aug 2013, Mike Snitzer wrote:

> Mikulas' point is that you cannot reduce the size to smaller than 1.
> And aside from rq-based DM, 1 is sufficient to allow for forward
> progress even when memory is completely consumed.
>
> A patch that simply changes them to 1 but makes the rq-based DM
> mempool's size configurable should actually be fine.

Yes, I think it would be viable.

Mikulas

> Mike

2013-08-20 23:14:26

by Frank Mayhar

Subject: Re: dm: Make MIN_IOS, et al, tunable via sysctl.

On Tue, 2013-08-20 at 18:24 -0400, Mike Snitzer wrote:
> Mikulas' point is that you cannot reduce the size to smaller than 1.
> And aside from rq-based DM, 1 is sufficient to allow for forward
> progress even when memory is completely consumed.
>
> A patch that simply changes them to 1 but makes the rq-based DM
> mempool's size configurable should actually be fine.

So you're saying that I should submit a patch to drop the pool size for
BIO_BASED to 1 and make the pool size for REQUEST_BASED configurable?
At the moment, in dm.c the former is hardcoded to 16 and the latter is
set via MIN_IOS (currently 256). There's also io_pool, a slab pool,
which is also set via MIN_IOS.

How does this relate to the rest of the DM modules? Mpath also sets
MIN_IOS to 256 and creates a slab pool from that, and there are a number
of hardcoded constants in dm-io (MIN_IOS and MIN_BIOS), dm-snap
(MIN_IOS), dm-crypt (MIN_IOS and MIN_POOL_PAGES), dm-bufio
(DM_BUFIO_HASH_BITS, which is allocated via vmalloc per client) and
dm-verity (DM_VERITY_MEMPOOL_SIZE, which is allocated per device).

For the most part I can't imagine that people will want to change these
from their defaults, but when someone does need to change one of these,
they need to do so badly and there's currently no good way to do that
besides hacking the source and building a new kernel.

By the way, I do appreciate the advice. I'm just trying to clear up
confusion on my part, make sure that our needs are met and, while I'm at
it, make things a bit better for those who come after me.
--
Frank Mayhar
310-460-4042

2013-08-26 14:28:52

by Mikulas Patocka

Subject: Re: dm: Make MIN_IOS, et al, tunable via sysctl.



On Tue, 20 Aug 2013, Frank Mayhar wrote:

> On Tue, 2013-08-20 at 18:24 -0400, Mike Snitzer wrote:
> > Mikulas' point is that you cannot reduce the size to smaller than 1.
> > And aside from rq-based DM, 1 is sufficient to allow for forward
> > progress even when memory is completely consumed.
> >
> > A patch that simply changes them to 1 but makes the rq-based DM
> > mempool's size configurable should actually be fine.
>
> So you're saying that I should submit a patch to drop the pool size for
> BIO_BASED to 1 and make the pool size for REQUEST_BASED configurable?
> At the moment, in dm.c the former is hardcoded to 16 and the latter is
> set via MIN_IOS (currently 256). There's also io_pool, a slab pool,
> which is also set via MIN_IOS.

In the case of request-based io, yes, submit a patch that makes it
configurable.

Regarding bio-based io, there is:
pools->io_pool = mempool_create_slab_pool(MIN_IOS, cachep);
which allocates a pool of 256 entries. I think it could be reduced to 16 for
bio-based devices, like the other pools.

As for reducing 16 further down to 1 - do you have a setup where it really
helps? Please describe your system - the amount of memory, the number of
dm devices, and how much memory is saved by reducing the mempools from 16
to 1.

> How does this relate to the rest of the DM modules? Mpath also sets
> MIN_IOS to 256 and creates a slab pool from that, and there are a number
> of hardcoded constants in dm-io (MIN_IOS and MIN_BIOS), dm-snap
> (MIN_IOS),

In dm-snap you shouldn't reduce it because it can cause failure.

> dm-crypt (MIN_IOS and MIN_POOL_PAGES), dm-bufio
> (DM_BUFIO_HASH_BITS, which is allocated via vmalloc per client) and

The dm-bufio hash could be reduced; one possibility is to auto-tune it based
on device size, another is to allocate a shared hash table for all
devices.

> dm-verity (DM_VERITY_MEMPOOL_SIZE, which is allocated per device).
>
> For the most part I can't imagine that people will want to change these
> from their defaults, but when someone does need to change one of these,
> they need to do so badly and there's currently no good way to do that
> besides hacking the source and building a new kernel.
>
> By the way, I do appreciate the advice. I'm just trying to clear up
> confusion on my part, make sure that our needs are met and, while I'm at
> it, make things a bit better for those who come after me.
> --
> Frank Mayhar
> 310-460-4042

Mikulas