2022-06-01 20:00:34

by Tony Battersby

Subject: [PATCH 00/10] mpt3sas and dmapool scalability

This patch series improves dmapool scalability by replacing linear scans
with red-black trees.

History:

In 2018 this patch series made it through 4 versions. v1 used red-black
trees; v2-v4 put the dma pool info directly into struct page and used
virt_to_page() to get at it. v4 made a brief appearance in linux-next,
but it caused problems on non-x86 archs where virt_to_page() doesn't
work with dma_alloc_coherent(), so it was reverted. I was too busy at
the time to repost the red-black tree version, and I forgot about it
until now. This version is based on the red-black trees of v1, but it
addresses all the review comments I received at the time and adds
further cleanup patches.

Note that Keith Busch is also working on improving dmapool scalability,
so for now I would recommend not merging my scalability patches until
Keith's approach can be evaluated. In the meantime, my patches can
serve as a benchmark comparison. I also have a number of cleanup
patches in my series that could be useful on their own.

References:

v1
https://lore.kernel.org/linux-mm/[email protected]/

v2
https://lore.kernel.org/linux-mm/[email protected]/

v3
https://lore.kernel.org/linux-mm/[email protected]/

v4
https://lore.kernel.org/linux-mm/[email protected]/

problem caused by virt_to_page()
https://lore.kernel.org/linux-kernel/[email protected]/

Keith Busch's dmapool performance enhancements
https://lore.kernel.org/linux-mm/[email protected]/

Below is my original description of the motivation for these patches.

drivers/scsi/mpt3sas is running into a scalability problem with the
kernel's DMA pool implementation. With an LSI/Broadcom SAS 9300-8i
12Gb/s HBA and max_sgl_entries=256, during modprobe, mpt3sas does the
equivalent of:

chain_dma_pool = dma_pool_create(size = 128);
for (i = 0; i < 373959; i++)
{
        dma_addr[i] = dma_pool_alloc(chain_dma_pool);
}

And at rmmod, system shutdown, or system reboot, mpt3sas does the
equivalent of:

for (i = 0; i < 373959; i++)
{
        dma_pool_free(chain_dma_pool, dma_addr[i]);
}
dma_pool_destroy(chain_dma_pool);
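
For readers unfamiliar with the dmapool API, here is a minimal
self-contained sketch of the same pattern written against the real
interface. This is an illustration only, not mpt3sas code; the pool
name, the 16-byte alignment, the zero boundary and the function name
are placeholders:

#include <linux/device.h>
#include <linux/dmapool.h>
#include <linux/gfp.h>
#include <linux/vmalloc.h>

#define CHAIN_COUNT 373959

static int chain_pool_demo(struct device *dev)
{
        struct dma_pool *pool;
        void **vaddr;
        dma_addr_t *dma_addr;
        int i, ret = -ENOMEM;

        vaddr = vzalloc(CHAIN_COUNT * sizeof(*vaddr));
        dma_addr = vzalloc(CHAIN_COUNT * sizeof(*dma_addr));
        if (!vaddr || !dma_addr)
                goto out_free_arrays;

        /* 128-byte blocks, 16-byte alignment, no boundary restriction */
        pool = dma_pool_create("chain pool", dev, 128, 16, 0);
        if (!pool)
                goto out_free_arrays;

        /* the allocation loop that scales poorly before the patches */
        for (i = 0; i < CHAIN_COUNT; i++) {
                vaddr[i] = dma_pool_alloc(pool, GFP_KERNEL, &dma_addr[i]);
                if (!vaddr[i])
                        break;
        }

        /* ... the driver would use the blocks here ... */

        /* the free loop that dominates rmmod/shutdown time */
        while (--i >= 0)
                dma_pool_free(pool, vaddr[i], dma_addr[i]);
        dma_pool_destroy(pool);
        ret = 0;

out_free_arrays:
        vfree(vaddr);
        vfree(dma_addr);
        return ret;
}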

With this usage, both the dma_pool_alloc() and dma_pool_free() loops
exhibit O(n^2) complexity, although dma_pool_free() is much worse:
dma_pool_alloc() usually finds a free block in the page at the head of
the page list and only walks the whole list when a new page is needed,
whereas dma_pool_free() has to locate the specific page that owns the
DMA address on every call, scanning half of the page list on average.
On my system, the dma_pool_free() loop above takes about 9 seconds to
run. Note that the problem was even worse before commit 74522a92bbf0
("scsi: mpt3sas: Optimize I/O memory consumption in driver."), where
the dma_pool_free() loop could take ~30 seconds.

mpt3sas also has some other DMA pools, but chain_dma_pool is the only
one with so many allocations:

cat /sys/devices/pci0000:80/0000:80:07.0/0000:85:00.0/pools
(manually cleaned up column alignment; columns are pool name, blocks in
use, total blocks, block size in bytes, and pages)
poolinfo - 0.1
reply_post_free_array pool      1      21     192     1
reply_free pool                 1       1   41728     1
reply pool                      1       1 1335296     1
sense pool                      1       1  970272     1
chain pool                 373959  386048     128 12064
reply_post_free pool           12      12  166528    12

The patches in this series improve the scalability of the DMA pool
implementation, which greatly reduces the running time of the DMA
alloc/free loops. With the patches applied, "modprobe mpt3sas",
"rmmod mpt3sas", and system shutdown/reboot with mpt3sas loaded are all
significantly faster. Here are some benchmarks (of the DMA alloc/free
loops only, not the entire modprobe/rmmod):

dma_pool_create() + dma_pool_alloc() loop, size = 128, count = 373959
original: 350 ms ( 1x)
dmapool patches: 18 ms (19x)

dma_pool_free() loop + dma_pool_destroy(), size = 128, count = 373959
original: 8901 ms ( 1x)
dmapool patches: 19 ms ( 477x)




2022-06-01 20:10:45

by Tony Battersby

Subject: [PATCH 08/10] dmapool: cleanup dma_pool_destroy

Remove a small amount of code duplication between dma_pool_destroy() and
pool_free_page() in preparation for adding more code without having to
duplicate it. No functional changes.

Signed-off-by: Tony Battersby <[email protected]>
---
mm/dmapool.c | 34 ++++++++++++++++++++--------------
1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/mm/dmapool.c b/mm/dmapool.c
index 8749a9d7927e..58c11dcaa4e4 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -250,14 +250,25 @@ static inline bool is_page_busy(struct dma_page *page)
return page->in_use != 0;
}

-static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
+static void pool_free_page(struct dma_pool *pool,
+ struct dma_page *page,
+ bool destroying_pool)
{
+ void *vaddr = page->vaddr;
dma_addr_t dma = page->dma;

+ if (destroying_pool && is_page_busy(page)) {
+ dev_err(pool->dev,
+ "dma_pool_destroy %s, %p busy\n",
+ pool->name, vaddr);
+ /* leak the still-in-use consistent memory */
+ } else {
#ifdef DMAPOOL_DEBUG
- memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
+ memset(vaddr, POOL_POISON_FREED, pool->allocation);
#endif
- dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma);
+ dma_free_coherent(pool->dev, pool->allocation, vaddr, dma);
+ }
+
list_del(&page->page_list);
kfree(page);
}
@@ -272,7 +283,7 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
*/
void dma_pool_destroy(struct dma_pool *pool)
{
- struct dma_page *page, *tmp;
+ struct dma_page *page;
bool empty = false;

if (unlikely(!pool))
@@ -288,15 +299,10 @@ void dma_pool_destroy(struct dma_pool *pool)
device_remove_file(pool->dev, &dev_attr_pools);
mutex_unlock(&pools_reg_lock);

- list_for_each_entry_safe(page, tmp, &pool->page_list, page_list) {
- if (is_page_busy(page)) {
- dev_err(pool->dev, "%s %s, %p busy\n", __func__,
- pool->name, page->vaddr);
- /* leak the still-in-use consistent memory */
- list_del(&page->page_list);
- kfree(page);
- } else
- pool_free_page(pool, page);
+ while ((page = list_first_entry_or_null(&pool->page_list,
+ struct dma_page,
+ page_list))) {
+ pool_free_page(pool, page, true);
}

kfree(pool);
@@ -469,7 +475,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
page->offset = offset;
/*
* Resist a temptation to do
- * if (!is_page_busy(page)) pool_free_page(pool, page);
+ * if (!is_page_busy(page)) pool_free_page(pool, page, false);
* Better have a few empty pages hang around.
*/
spin_unlock_irqrestore(&pool->lock, flags);
--
2.25.1


2022-06-01 20:44:02

by Tony Battersby

Subject: [PATCH 04/10] dmapool: improve accuracy of debug statistics

The "total number of blocks in pool" debug statistic currently does not
take the boundary value into account, so it diverges from the "total
number of blocks in use" statistic when a boundary is in effect. Add a
calculation for the number of blocks per allocation that takes the
boundary into account, and use it to replace the inaccurate calculation.

This depends on the patch "dmapool: fix boundary comparison" for the
calculated blks_per_alloc value to be correct.
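
An example with illustrative numbers (not taken from the patch): with
allocation = 4096, size = 192 and boundary = 2048, the old
"allocation / size" calculation reports 4096 / 192 = 21 blocks per
page, but the 11th block would straddle the 2048-byte boundary and can
never be handed out. The new calculation gives
(4096 / 2048) * (2048 / 192) + (4096 % 2048) / 192 = 2 * 10 + 0 = 20,
which matches the number of blocks the page can actually provide.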

Signed-off-by: Tony Battersby <[email protected]>
---
mm/dmapool.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/mm/dmapool.c b/mm/dmapool.c
index 782143144a32..9e30f4425dea 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -47,6 +47,7 @@ struct dma_pool { /* the pool */
struct device *dev;
unsigned int allocation;
unsigned int boundary;
+ unsigned int blks_per_alloc;
char name[32];
struct list_head pools;
};
@@ -92,8 +93,7 @@ static ssize_t pools_show(struct device *dev, struct device_attribute *attr, char *buf)
/* per-pool info, no real statistics yet */
temp = scnprintf(next, size, "%-16s %4zu %4zu %4u %2u\n",
pool->name, blocks,
- (size_t) pages *
- (pool->allocation / pool->size),
+ (size_t) pages * pool->blks_per_alloc,
pool->size, pages);
size -= temp;
next += temp;
@@ -168,6 +168,9 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
retval->size = size;
retval->boundary = boundary;
retval->allocation = allocation;
+ retval->blks_per_alloc =
+ (allocation / boundary) * (boundary / size) +
+ (allocation % boundary) / size;

INIT_LIST_HEAD(&retval->pools);

--
2.25.1


2022-06-01 20:52:40

by Tony Battersby

Subject: [PATCH 07/10] dmapool: speedup DMAPOOL_DEBUG with init_on_alloc

Avoid double-memset of the same allocated memory in dma_pool_alloc()
when both DMAPOOL_DEBUG is enabled and init_on_alloc=1. In that case
the block is filled with POOL_POISON_ALLOCATED by the debug code and
then immediately zeroed again, because dma_pool_alloc() zeroes the
block whenever want_init_on_alloc() is true. want_init_on_alloc()
returns true when either the init_on_alloc static key is enabled or
__GFP_ZERO was passed, so checking it instead of just __GFP_ZERO skips
the redundant poison fill while keeping the old behavior otherwise.

Signed-off-by: Tony Battersby <[email protected]>
---
mm/dmapool.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/dmapool.c b/mm/dmapool.c
index 49019ef6dd83..8749a9d7927e 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -365,7 +365,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
break;
}
}
- if (!(mem_flags & __GFP_ZERO))
+ if (!want_init_on_alloc(mem_flags))
memset(retval, POOL_POISON_ALLOCATED, pool->size);
#endif
spin_unlock_irqrestore(&pool->lock, flags);
--
2.25.1


2022-06-01 20:58:41

by Tony Battersby

Subject: [PATCH 09/10] dmapool: improve scalability of dma_pool_alloc

dma_pool_alloc() scales poorly when allocating a large number of pages
because it does a linear scan of all previously-allocated pages before
allocating a new one. Improve its scalability by maintaining a separate
list of pages that have free blocks ready to (re)allocate. In big O
notation, this improves the algorithm from O(n^2) to O(n).
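
The bookkeeping relies on a standard <linux/list.h> idiom that is worth
spelling out: a node removed with list_del_init() is left pointing at
itself, so list_empty() applied to the node itself answers "is this
page currently linked into avail_page_list?". A minimal sketch of the
idiom with hypothetical names (illustration only, not code from the
patch):

#include <linux/list.h>

struct demo_page {
        struct list_head avail_link;    /* links pages with free blocks */
};

/* the last free block of @dp was just handed out */
static void demo_page_now_full(struct demo_page *dp)
{
        /* list_del_init() re-points the node at itself ... */
        list_del_init(&dp->avail_link);
}

/* a block was just returned to @dp */
static void demo_page_has_space(struct list_head *avail_list,
                                struct demo_page *dp)
{
        /* ... so list_empty() on the node reports whether it is linked */
        if (list_empty(&dp->avail_link))
                list_add(&dp->avail_link, avail_list);
}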

Signed-off-by: Tony Battersby <[email protected]>
---
mm/dmapool.c | 27 +++++++++++++++++++++++----
1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/mm/dmapool.c b/mm/dmapool.c
index 58c11dcaa4e4..b3dd2ace0d2a 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -17,6 +17,10 @@
* least 'size' bytes. Free blocks are tracked in an unsorted singly-linked
* list of free blocks within the page. Used blocks aren't tracked, but we
* keep a count of how many are currently allocated from each page.
+ *
+ * The avail_page_list keeps track of pages that have one or more free blocks
+ * available to (re)allocate. Pages are moved in and out of avail_page_list
+ * as their blocks are allocated and freed.
*/

#include <linux/device.h>
@@ -42,6 +46,7 @@

struct dma_pool { /* the pool */
struct list_head page_list;
+ struct list_head avail_page_list;
spinlock_t lock;
unsigned int size;
struct device *dev;
@@ -54,6 +59,7 @@ struct dma_pool { /* the pool */

struct dma_page { /* cacheable header for 'allocation' bytes */
struct list_head page_list;
+ struct list_head avail_page_link;
void *vaddr;
dma_addr_t dma;
unsigned int in_use;
@@ -164,6 +170,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
retval->dev = dev;

INIT_LIST_HEAD(&retval->page_list);
+ INIT_LIST_HEAD(&retval->avail_page_list);
spin_lock_init(&retval->lock);
retval->size = size;
retval->boundary = boundary;
@@ -270,6 +277,7 @@ static void pool_free_page(struct dma_pool *pool,
}

list_del(&page->page_list);
+ list_del(&page->avail_page_link);
kfree(page);
}

@@ -330,10 +338,11 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
might_alloc(mem_flags);

spin_lock_irqsave(&pool->lock, flags);
- list_for_each_entry(page, &pool->page_list, page_list) {
- if (page->offset < pool->allocation)
- goto ready;
- }
+ page = list_first_entry_or_null(&pool->avail_page_list,
+ struct dma_page,
+ avail_page_link);
+ if (page)
+ goto ready;

/* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
spin_unlock_irqrestore(&pool->lock, flags);
@@ -345,10 +354,13 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
spin_lock_irqsave(&pool->lock, flags);

list_add(&page->page_list, &pool->page_list);
+ list_add(&page->avail_page_link, &pool->avail_page_list);
ready:
page->in_use++;
offset = page->offset;
page->offset = *(int *)(page->vaddr + offset);
+ if (page->offset >= pool->allocation)
+ list_del_init(&page->avail_page_link);
retval = offset + page->vaddr;
*handle = offset + page->dma;
#ifdef DMAPOOL_DEBUG
@@ -470,6 +482,13 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
memset(vaddr, 0, pool->size);
#endif

+ /*
+ * list_empty() on the page tests if the page is already linked into
+ * avail_page_list to avoid adding it more than once.
+ */
+ if (list_empty(&page->avail_page_link))
+ list_add(&page->avail_page_link, &pool->avail_page_list);
+
page->in_use--;
*(int *)vaddr = page->offset;
page->offset = offset;
--
2.25.1


2022-06-01 21:14:17

by Tony Battersby

Subject: [PATCH 10/10] dmapool: improve scalability of dma_pool_free

dma_pool_free() scales poorly when the pool contains many pages because
pool_find_page() does a linear scan of all allocated pages. Improve its
scalability by replacing the linear scan with a red-black tree lookup.
In big O notation, this improves the algorithm from O(n^2) to O(n * log n).
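
For a sense of scale (illustrative arithmetic based on the chain pool
figures from the cover letter): with 12064 pages in the pool, the old
linear scan in pool_find_page() examines about 6000 pages on an average
call, while a balanced red-black tree lookup needs on the order of
log2(12064), i.e. roughly 14 comparisons.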

Signed-off-by: Tony Battersby <[email protected]>
---
mm/dmapool.c | 128 ++++++++++++++++++++++++++++++++++++++++-----------
1 file changed, 100 insertions(+), 28 deletions(-)

diff --git a/mm/dmapool.c b/mm/dmapool.c
index b3dd2ace0d2a..24535483f781 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -12,11 +12,12 @@
* Many older drivers still have their own code to do this.
*
* The current design of this allocator is fairly simple. The pool is
- * represented by the 'struct dma_pool' which keeps a doubly-linked list of
- * allocated pages. Each page in the page_list is split into blocks of at
- * least 'size' bytes. Free blocks are tracked in an unsorted singly-linked
- * list of free blocks within the page. Used blocks aren't tracked, but we
- * keep a count of how many are currently allocated from each page.
+ * represented by the 'struct dma_pool' which keeps a red-black tree of all
+ * allocated pages, keyed by DMA address for fast lookup when freeing.
+ * Each page in the page_tree is split into blocks of at least 'size' bytes.
+ * Free blocks are tracked in an unsorted singly-linked list of free blocks
+ * within the page. Used blocks aren't tracked, but we keep a count of how
+ * many are currently allocated from each page.
*
* The avail_page_list keeps track of pages that have one or more free blocks
* available to (re)allocate. Pages are moved in and out of avail_page_list
@@ -36,6 +37,7 @@
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/spinlock.h>
+#include <linux/rbtree.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/wait.h>
@@ -45,7 +47,7 @@
#endif

struct dma_pool { /* the pool */
- struct list_head page_list;
+ struct rb_root page_tree;
struct list_head avail_page_list;
spinlock_t lock;
unsigned int size;
@@ -58,7 +60,7 @@ struct dma_pool { /* the pool */
};

struct dma_page { /* cacheable header for 'allocation' bytes */
- struct list_head page_list;
+ struct rb_node page_node;
struct list_head avail_page_link;
void *vaddr;
dma_addr_t dma;
@@ -69,6 +71,11 @@ struct dma_page { /* cacheable header for 'allocation' bytes */
static DEFINE_MUTEX(pools_lock);
static DEFINE_MUTEX(pools_reg_lock);

+static inline struct dma_page *rb_to_dma_page(struct rb_node *node)
+{
+ return rb_entry(node, struct dma_page, page_node);
+}
+
static ssize_t pools_show(struct device *dev, struct device_attribute *attr, char *buf)
{
unsigned temp;
@@ -76,6 +83,7 @@ static ssize_t pools_show(struct device *dev, struct device_attribute *attr, char *buf)
char *next;
struct dma_page *page;
struct dma_pool *pool;
+ struct rb_node *node;

next = buf;
size = PAGE_SIZE;
@@ -90,7 +98,10 @@ static ssize_t pools_show(struct device *dev, struct device_attribute *attr, char *buf)
size_t blocks = 0;

spin_lock_irq(&pool->lock);
- list_for_each_entry(page, &pool->page_list, page_list) {
+ for (node = rb_first(&pool->page_tree);
+ node;
+ node = rb_next(node)) {
+ page = rb_to_dma_page(node);
pages++;
blocks += page->in_use;
}
@@ -169,7 +180,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,

retval->dev = dev;

- INIT_LIST_HEAD(&retval->page_list);
+ retval->page_tree = RB_ROOT;
INIT_LIST_HEAD(&retval->avail_page_list);
spin_lock_init(&retval->lock);
retval->size = size;
@@ -213,6 +224,63 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
}
EXPORT_SYMBOL(dma_pool_create);

+/*
+ * Find the dma_page that manages the given DMA address.
+ */
+static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
+{
+ struct rb_node *node = pool->page_tree.rb_node;
+
+ while (node) {
+ struct dma_page *page = rb_to_dma_page(node);
+
+ if (dma < page->dma)
+ node = node->rb_left;
+ else if ((dma - page->dma) >= pool->allocation)
+ node = node->rb_right;
+ else
+ return page;
+ }
+ return NULL;
+}
+
+/*
+ * Insert a dma_page into the page_tree.
+ */
+static int pool_insert_page(struct dma_pool *pool, struct dma_page *new_page)
+{
+ dma_addr_t dma = new_page->dma;
+ struct rb_node **node = &(pool->page_tree.rb_node), *parent = NULL;
+
+ while (*node) {
+ struct dma_page *this_page = rb_to_dma_page(*node);
+
+ parent = *node;
+ if (dma < this_page->dma)
+ node = &((*node)->rb_left);
+ else if (likely((dma - this_page->dma) >= pool->allocation))
+ node = &((*node)->rb_right);
+ else {
+ /*
+ * A page that overlaps the new DMA range is already
+ * present in the tree. This should not happen.
+ */
+ WARN(1,
+ "%s: %s: DMA address overlap: old %pad new %pad len %u\n",
+ dev_name(pool->dev),
+ pool->name, &this_page->dma, &dma,
+ pool->allocation);
+ return -1;
+ }
+ }
+
+ /* Add new node and rebalance tree. */
+ rb_link_node(&new_page->page_node, parent, node);
+ rb_insert_color(&new_page->page_node, &pool->page_tree);
+
+ return 0;
+}
+
static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
{
unsigned int offset = 0;
@@ -276,8 +344,16 @@ static void pool_free_page(struct dma_pool *pool,
dma_free_coherent(pool->dev, pool->allocation, vaddr, dma);
}

- list_del(&page->page_list);
list_del(&page->avail_page_link);
+
+ /*
+ * If the pool is being destroyed, it is not safe to modify the
+ * page_tree while iterating over it, and it is also unnecessary since
+ * the whole tree will be discarded anyway.
+ */
+ if (!destroying_pool)
+ rb_erase(&page->page_node, &pool->page_tree);
+
kfree(page);
}

@@ -291,7 +367,7 @@ static void pool_free_page(struct dma_pool *pool,
*/
void dma_pool_destroy(struct dma_pool *pool)
{
- struct dma_page *page;
+ struct dma_page *page, *tmp;
bool empty = false;

if (unlikely(!pool))
@@ -307,9 +383,10 @@ void dma_pool_destroy(struct dma_pool *pool)
device_remove_file(pool->dev, &dev_attr_pools);
mutex_unlock(&pools_reg_lock);

- while ((page = list_first_entry_or_null(&pool->page_list,
- struct dma_page,
- page_list))) {
+ rbtree_postorder_for_each_entry_safe(page,
+ tmp,
+ &pool->page_tree,
+ page_node) {
pool_free_page(pool, page, true);
}

@@ -353,7 +430,15 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,

spin_lock_irqsave(&pool->lock, flags);

- list_add(&page->page_list, &pool->page_list);
+ if (unlikely(pool_insert_page(pool, page))) {
+ /*
+ * This should not happen, so something must have gone horribly
+ * wrong. Instead of crashing, intentionally leak the memory
+ * and make for the exit.
+ */
+ spin_unlock_irqrestore(&pool->lock, flags);
+ return NULL;
+ }
list_add(&page->avail_page_link, &pool->avail_page_list);
ready:
page->in_use++;
@@ -395,19 +480,6 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
}
EXPORT_SYMBOL(dma_pool_alloc);

-static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
-{
- struct dma_page *page;
-
- list_for_each_entry(page, &pool->page_list, page_list) {
- if (dma < page->dma)
- continue;
- if ((dma - page->dma) < pool->allocation)
- return page;
- }
- return NULL;
-}
-
/**
* dma_pool_free - put block back into dma pool
* @pool: the dma pool holding the block
--
2.25.1