2019-11-20 19:28:09

by Scott Cheloha

[permalink] [raw]
Subject: [PATCH] memory subsystem: cache memory blocks in radix tree to accelerate lookup

Searching for a particular memory block by id is slow because each block
device is kept in an unsorted linked list on the subsystem bus.

Lookup is much faster if we cache the blocks in a radix tree. Memory
subsystem initialization and hotplug/hotunplug is at least a little faster
for any machine with more than ~100 blocks, and the speedup grows with
the block count.

Signed-off-by: Scott Cheloha <[email protected]>
---
On a 40GB Power9 VM I'm seeing nice initialization speed improvements
consistent with the change in data structure. Here's the per-block
timings before the patch:

# uptime elapsed block-id
0.005121 0.000033 0
0.005154 0.000028 1
0.005182 0.000035 2
0.005217 0.000030 3
0.005247 0.000039 4
0.005286 0.000031 5
0.005317 0.000030 6
0.005347 0.000031 7
0.005378 0.000030 8
0.005408 0.000031 9
[...]
0.091603 0.000143 999
0.091746 0.000175 1000
0.091921 0.000143 1001
0.092064 0.000142 1002
0.092206 0.000143 1003
0.092349 0.000143 1004
0.092492 0.000143 1005
0.092635 0.000144 1006
0.092779 0.000143 1007
0.092922 0.000144 1008
[...]
0.301879 0.000258 2038
0.302137 0.000267 2039
0.302404 0.000291 2040
0.302695 0.000259 2041
0.302954 0.000258 2042
0.303212 0.000259 2043
0.303471 0.000260 2044
0.303731 0.000258 2045
0.303989 0.000259 2046
0.304248 0.000260 2047

Obviously a linear growth with each block.

With the patch:

# uptime elapsed block-id
0.004701 0.000029 0
0.004730 0.000028 1
0.004758 0.000028 2
0.004786 0.000027 3
0.004813 0.000037 4
0.004850 0.000027 5
0.004877 0.000027 6
0.004904 0.000027 7
0.004931 0.000026 8
0.004957 0.000027 9
[...]
0.032718 0.000027 999
0.032745 0.000027 1000
0.032772 0.000026 1001
0.032798 0.000027 1002
0.032825 0.000027 1003
0.032852 0.000026 1004
0.032878 0.000027 1005
0.032905 0.000027 1006
0.032932 0.000026 1007
0.032958 0.000027 1008
[...]
0.061148 0.000027 2038
0.061175 0.000027 2039
0.061202 0.000026 2040
0.061228 0.000027 2041
0.061255 0.000027 2042
0.061282 0.000026 2043
0.061308 0.000027 2044
0.061335 0.000026 2045
0.061361 0.000026 2046
0.061387 0.000027 2047

It flattens out.

I'm seeing similar changes on my development PC but the numbers are
less drastic because the block size on a PC grows with the amount
of memory. On powerpc the gains are a lot more visible because the
block size tops out at 256MB.

drivers/base/memory.c | 53 ++++++++++++++++++++++++++-----------------
1 file changed, 32 insertions(+), 21 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 84c4e1f72cbd..fc0a4880c321 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -20,6 +20,7 @@
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mutex.h>
+#include <linux/radix-tree.h>
#include <linux/stat.h>
#include <linux/slab.h>

@@ -59,6 +60,13 @@ static struct bus_type memory_subsys = {
.offline = memory_subsys_offline,
};

+/*
+ * Memory blocks are cached in a local radix tree to avoid
+ * a costly linear search for the corresponding device on
+ * the subsystem bus.
+ */
+static RADIX_TREE(memory_block_tree, GFP_KERNEL);
+
static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
@@ -580,20 +588,14 @@ int __weak arch_get_memory_phys_device(unsigned long start_pfn)
/* A reference for the returned memory block device is acquired. */
static struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
- struct device *dev;
+ struct memory_block *mem;

- dev = subsys_find_device_by_id(&memory_subsys, block_id, NULL);
- return dev ? to_memory_block(dev) : NULL;
+ mem = radix_tree_lookup(&memory_block_tree, block_id);
+ if (mem)
+ get_device(&mem->dev);
+ return mem;
}

-/*
- * For now, we have a linear search to go find the appropriate
- * memory_block corresponding to a particular phys_index. If
- * this gets to be a real problem, we can always use a radix
- * tree or something here.
- *
- * This could be made generic for all device subsystems.
- */
struct memory_block *find_memory_block(struct mem_section *section)
{
unsigned long block_id = base_memory_block_id(__section_nr(section));
@@ -636,9 +638,15 @@ int register_memory(struct memory_block *memory)
memory->dev.offline = memory->state == MEM_OFFLINE;

ret = device_register(&memory->dev);
- if (ret)
+ if (ret) {
put_device(&memory->dev);
-
+ return ret;
+ }
+ ret = radix_tree_insert(&memory_block_tree, memory->dev.id, memory);
+ if (ret) {
+ put_device(&memory->dev);
+ device_unregister(&memory->dev);
+ }
return ret;
}

@@ -696,6 +704,8 @@ static void unregister_memory(struct memory_block *memory)
if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
return;

+ WARN_ON(radix_tree_delete(&memory_block_tree, memory->dev.id) == NULL);
+
/* drop the ref. we got via find_memory_block() */
put_device(&memory->dev);
device_unregister(&memory->dev);
@@ -851,20 +861,21 @@ void __init memory_dev_init(void)
int walk_memory_blocks(unsigned long start, unsigned long size,
void *arg, walk_memory_blocks_func_t func)
{
- const unsigned long start_block_id = phys_to_block_id(start);
- const unsigned long end_block_id = phys_to_block_id(start + size - 1);
+ struct radix_tree_iter iter;
+ const unsigned long start_id = phys_to_block_id(start);
+ const unsigned long end_id = phys_to_block_id(start + size - 1);
struct memory_block *mem;
- unsigned long block_id;
+ void **slot;
int ret = 0;

if (!size)
return 0;

- for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
- mem = find_memory_block_by_id(block_id);
- if (!mem)
- continue;
-
+ radix_tree_for_each_slot(slot, &memory_block_tree, &iter, start_id) {
+ mem = radix_tree_deref_slot(slot);
+ if (mem->dev.id > end_id)
+ break;
+ get_device(&mem->dev);
ret = func(mem, arg);
put_device(&mem->dev);
if (ret)
--
2.24.0.rc1



2019-11-21 09:35:31

by David Hildenbrand

[permalink] [raw]
Subject: Re: [PATCH] memory subsystem: cache memory blocks in radix tree to accelerate lookup

On 20.11.19 20:25, Scott Cheloha wrote:
> Searching for a particular memory block by id is slow because each block
> device is kept in an unsorted linked list on the subsystem bus.
>
> Lookup is much faster if we cache the blocks in a radix tree. Memory
> subsystem initialization and hotplug/hotunplug is at least a little faster
> for any machine with more than ~100 blocks, and the speedup grows with
> the block count.
>
> Signed-off-by: Scott Cheloha <[email protected]>
> ---
> On a 40GB Power9 VM I'm seeing nice initialization speed improvements
> consistent with the change in data structure. Here's the per-block
> timings before the patch:
>
> # uptime elapsed block-id
> 0.005121 0.000033 0
> 0.005154 0.000028 1
> 0.005182 0.000035 2
> 0.005217 0.000030 3
> 0.005247 0.000039 4
> 0.005286 0.000031 5
> 0.005317 0.000030 6
> 0.005347 0.000031 7
> 0.005378 0.000030 8
> 0.005408 0.000031 9
> [...]
> 0.091603 0.000143 999
> 0.091746 0.000175 1000
> 0.091921 0.000143 1001
> 0.092064 0.000142 1002
> 0.092206 0.000143 1003
> 0.092349 0.000143 1004
> 0.092492 0.000143 1005
> 0.092635 0.000144 1006
> 0.092779 0.000143 1007
> 0.092922 0.000144 1008
> [...]
> 0.301879 0.000258 2038
> 0.302137 0.000267 2039
> 0.302404 0.000291 2040
> 0.302695 0.000259 2041
> 0.302954 0.000258 2042
> 0.303212 0.000259 2043
> 0.303471 0.000260 2044
> 0.303731 0.000258 2045
> 0.303989 0.000259 2046
> 0.304248 0.000260 2047
>
> Obviously a linear growth with each block.
>
> With the patch:
>
> # uptime elapsed block-id
> 0.004701 0.000029 0
> 0.004730 0.000028 1
> 0.004758 0.000028 2
> 0.004786 0.000027 3
> 0.004813 0.000037 4
> 0.004850 0.000027 5
> 0.004877 0.000027 6
> 0.004904 0.000027 7
> 0.004931 0.000026 8
> 0.004957 0.000027 9
> [...]
> 0.032718 0.000027 999
> 0.032745 0.000027 1000
> 0.032772 0.000026 1001
> 0.032798 0.000027 1002
> 0.032825 0.000027 1003
> 0.032852 0.000026 1004
> 0.032878 0.000027 1005
> 0.032905 0.000027 1006
> 0.032932 0.000026 1007
> 0.032958 0.000027 1008
> [...]
> 0.061148 0.000027 2038
> 0.061175 0.000027 2039
> 0.061202 0.000026 2040
> 0.061228 0.000027 2041
> 0.061255 0.000027 2042
> 0.061282 0.000026 2043
> 0.061308 0.000027 2044
> 0.061335 0.000026 2045
> 0.061361 0.000026 2046
> 0.061387 0.000027 2047
>
> It flattens out.
>
> I'm seeing similar changes on my development PC but the numbers are
> less drastic because the block size on a PC grows with the amount
> of memory. On powerpc the gains are a lot more visible because the
> block size tops out at 256MB.

One could argue that this is only needed for bigger machines. If we ever
care, we could defer filling/using the tree once a certain block count
is reached. Future work if we really care.

>
> drivers/base/memory.c | 53 ++++++++++++++++++++++++++-----------------
> 1 file changed, 32 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index 84c4e1f72cbd..fc0a4880c321 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -20,6 +20,7 @@
> #include <linux/memory_hotplug.h>
> #include <linux/mm.h>
> #include <linux/mutex.h>
> +#include <linux/radix-tree.h>
> #include <linux/stat.h>
> #include <linux/slab.h>
>
> @@ -59,6 +60,13 @@ static struct bus_type memory_subsys = {
> .offline = memory_subsys_offline,
> };
>
> +/*
> + * Memory blocks are cached in a local radix tree to avoid
> + * a costly linear search for the corresponding device on
> + * the subsystem bus.
> + */
> +static RADIX_TREE(memory_block_tree, GFP_KERNEL);

simply memory_blocks or memory_block_devices?

> +
> static BLOCKING_NOTIFIER_HEAD(memory_chain);
>
> int register_memory_notifier(struct notifier_block *nb)
> @@ -580,20 +588,14 @@ int __weak arch_get_memory_phys_device(unsigned long start_pfn)
> /* A reference for the returned memory block device is acquired. */
> static struct memory_block *find_memory_block_by_id(unsigned long block_id)
> {
> - struct device *dev;
> + struct memory_block *mem;
>
> - dev = subsys_find_device_by_id(&memory_subsys, block_id, NULL);
> - return dev ? to_memory_block(dev) : NULL;
> + mem = radix_tree_lookup(&memory_block_tree, block_id);
> + if (mem)
> + get_device(&mem->dev);
> + return mem;
> }
>
> -/*
> - * For now, we have a linear search to go find the appropriate
> - * memory_block corresponding to a particular phys_index. If
> - * this gets to be a real problem, we can always use a radix
> - * tree or something here.
> - *
> - * This could be made generic for all device subsystems.
> - */

As this really only makes sense for subsystems with a lot of devices, I
think it is the right thing to do to keep it local in here for now.

> struct memory_block *find_memory_block(struct mem_section *section)
> {
> unsigned long block_id = base_memory_block_id(__section_nr(section));
> @@ -636,9 +638,15 @@ int register_memory(struct memory_block *memory)
> memory->dev.offline = memory->state == MEM_OFFLINE;
>
> ret = device_register(&memory->dev);
> - if (ret)
> + if (ret) {
> put_device(&memory->dev);
> -
> + return ret;
> + }
> + ret = radix_tree_insert(&memory_block_tree, memory->dev.id, memory);
> + if (ret) {
> + put_device(&memory->dev);
> + device_unregister(&memory->dev);
> + }
> return ret;
> }
>
> @@ -696,6 +704,8 @@ static void unregister_memory(struct memory_block *memory)
> if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
> return;
>
> + WARN_ON(radix_tree_delete(&memory_block_tree, memory->dev.id) == NULL);
> +
> /* drop the ref. we got via find_memory_block() */
> put_device(&memory->dev);
> device_unregister(&memory->dev);
> @@ -851,20 +861,21 @@ void __init memory_dev_init(void)
> int walk_memory_blocks(unsigned long start, unsigned long size,
> void *arg, walk_memory_blocks_func_t func)
> {
> - const unsigned long start_block_id = phys_to_block_id(start);
> - const unsigned long end_block_id = phys_to_block_id(start + size - 1);
> + struct radix_tree_iter iter;
> + const unsigned long start_id = phys_to_block_id(start);
> + const unsigned long end_id = phys_to_block_id(start + size - 1);

Please don't rename these two variables, unrelated change.

> struct memory_block *mem;
> - unsigned long block_id;
> + void **slot;
> int ret = 0;
>
> if (!size)
> return 0;
>
> - for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
> - mem = find_memory_block_by_id(block_id);
> - if (!mem)
> - continue;
> -
> + radix_tree_for_each_slot(slot, &memory_block_tree, &iter, start_id) {
> + mem = radix_tree_deref_slot(slot);
> + if (mem->dev.id > end_id)
> + break;

I think you could do a

if (iter.index > end_id)
break;
mem = radix_tree_deref_slot(slot);

instead, which would be nicer IMHO.

> + get_device(&mem->dev);
> ret = func(mem, arg);
> put_device(&mem->dev);

I think we can stop doing the get/put here (similar as in
for_each_memory_block() now), this function should only be called with
the device hotplug lock held (or early during boot), where no concurrent
hot(un)plug can happen. I suggest this change as an addon patch.

> if (ret)
>

With the changes

Acked-by: David Hildenbrand <[email protected]>

Thanks!

--

Thanks,

David / dhildenb

2019-11-21 09:37:58

by David Hildenbrand

[permalink] [raw]
Subject: Re: [PATCH] memory subsystem: cache memory blocks in radix tree to accelerate lookup

On 20.11.19 20:25, Scott Cheloha wrote:
> Searching for a particular memory block by id is slow because each block
> device is kept in an unsorted linked list on the subsystem bus.

Oh, and you patch subject should be

"drivers/base/memory.c: cache memory [...]"

--

Thanks,

David / dhildenb

2019-11-21 20:02:58

by Scott Cheloha

[permalink] [raw]
Subject: [PATCH v2] drivers/base/memory.c: cache blocks in radix tree to accelerate lookup

Searching for a particular memory block by id is slow because each block
device is kept in an unsorted linked list on the subsystem bus.

Lookup is much faster if we cache the blocks in a radix tree. Memory
subsystem initialization and hotplug/hotunplug is at least a little faster
for any machine with more than ~100 blocks, and the speedup grows with
the block count.

Signed-off-by: Scott Cheloha <[email protected]>
Acked-by: David Hildenbrand <[email protected]>
---
v2 incorporates suggestions from David Hildenbrand.

Removing the get_device/put_device dance from walk_memory_blocks() can
be done in a subsequent patch.
drivers/base/memory.c | 49 ++++++++++++++++++++++++++-----------------
1 file changed, 30 insertions(+), 19 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 84c4e1f72cbd..32e74fc1b0ac 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -20,6 +20,7 @@
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mutex.h>
+#include <linux/radix-tree.h>
#include <linux/stat.h>
#include <linux/slab.h>

@@ -59,6 +60,13 @@ static struct bus_type memory_subsys = {
.offline = memory_subsys_offline,
};

+/*
+ * Memory blocks are cached in a local radix tree to avoid
+ * a costly linear search for the corresponding device on
+ * the subsystem bus.
+ */
+static RADIX_TREE(memory_blocks, GFP_KERNEL);
+
static BLOCKING_NOTIFIER_HEAD(memory_chain);

int register_memory_notifier(struct notifier_block *nb)
@@ -580,20 +588,14 @@ int __weak arch_get_memory_phys_device(unsigned long start_pfn)
/* A reference for the returned memory block device is acquired. */
static struct memory_block *find_memory_block_by_id(unsigned long block_id)
{
- struct device *dev;
+ struct memory_block *mem;

- dev = subsys_find_device_by_id(&memory_subsys, block_id, NULL);
- return dev ? to_memory_block(dev) : NULL;
+ mem = radix_tree_lookup(&memory_blocks, block_id);
+ if (mem)
+ get_device(&mem->dev);
+ return mem;
}

-/*
- * For now, we have a linear search to go find the appropriate
- * memory_block corresponding to a particular phys_index. If
- * this gets to be a real problem, we can always use a radix
- * tree or something here.
- *
- * This could be made generic for all device subsystems.
- */
struct memory_block *find_memory_block(struct mem_section *section)
{
unsigned long block_id = base_memory_block_id(__section_nr(section));
@@ -636,9 +638,15 @@ int register_memory(struct memory_block *memory)
memory->dev.offline = memory->state == MEM_OFFLINE;

ret = device_register(&memory->dev);
- if (ret)
+ if (ret) {
put_device(&memory->dev);
-
+ return ret;
+ }
+ ret = radix_tree_insert(&memory_blocks, memory->dev.id, memory);
+ if (ret) {
+ put_device(&memory->dev);
+ device_unregister(&memory->dev);
+ }
return ret;
}

@@ -696,6 +704,8 @@ static void unregister_memory(struct memory_block *memory)
if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys))
return;

+ WARN_ON(radix_tree_delete(&memory_blocks, memory->dev.id) == NULL);
+
/* drop the ref. we got via find_memory_block() */
put_device(&memory->dev);
device_unregister(&memory->dev);
@@ -851,20 +861,21 @@ void __init memory_dev_init(void)
int walk_memory_blocks(unsigned long start, unsigned long size,
void *arg, walk_memory_blocks_func_t func)
{
+ struct radix_tree_iter iter;
const unsigned long start_block_id = phys_to_block_id(start);
const unsigned long end_block_id = phys_to_block_id(start + size - 1);
struct memory_block *mem;
- unsigned long block_id;
+ void **slot;
int ret = 0;

if (!size)
return 0;

- for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
- mem = find_memory_block_by_id(block_id);
- if (!mem)
- continue;
-
+ radix_tree_for_each_slot(slot, &memory_blocks, &iter, start_block_id) {
+ if (iter.index > end_block_id)
+ break;
+ mem = radix_tree_deref_slot(slot);
+ get_device(&mem->dev);
ret = func(mem, arg);
put_device(&mem->dev);
if (ret)
--
2.24.0.rc1

2019-11-25 06:39:38

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH v2] drivers/base/memory.c: cache blocks in radix tree to accelerate lookup

Hi Scott,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on driver-core/driver-core-testing]
[also build test WARNING on v5.4]
[cannot apply to next-20191122]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url: https://github.com/0day-ci/linux/commits/Scott-Cheloha/drivers-base-memory-c-cache-blocks-in-radix-tree-to-accelerate-lookup/20191124-104557
base: https://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core.git 0e4a459f56c32d3e52ae69a4b447db2f48a65f44
reproduce:
# apt-get install sparse
# sparse version: v0.6.1-36-g9305d48-dirty
make ARCH=x86_64 allmodconfig
make C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__'

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <[email protected]>


sparse warnings: (new ones prefixed by >>)

>> drivers/base/memory.c:874:9: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected void **slot @@ got void [noderef] <asvoid **slot @@
>> drivers/base/memory.c:874:9: sparse: expected void **slot
>> drivers/base/memory.c:874:9: sparse: got void [noderef] <asn:4> **
>> drivers/base/memory.c:874:9: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected void **slot @@ got void [noderef] <asvoid **slot @@
>> drivers/base/memory.c:874:9: sparse: expected void **slot
>> drivers/base/memory.c:874:9: sparse: got void [noderef] <asn:4> **
>> drivers/base/memory.c:877:45: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected void [noderef] <asn:4> **slot @@ got n:4> **slot @@
>> drivers/base/memory.c:877:45: sparse: expected void [noderef] <asn:4> **slot
>> drivers/base/memory.c:877:45: sparse: got void **slot
drivers/base/memory.c:874:9: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected void [noderef] <asn:4> **slot @@ got n:4> **slot @@
drivers/base/memory.c:874:9: sparse: expected void [noderef] <asn:4> **slot
drivers/base/memory.c:874:9: sparse: got void **slot
>> drivers/base/memory.c:874:9: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected void **slot @@ got void [noderef] <asvoid **slot @@
>> drivers/base/memory.c:874:9: sparse: expected void **slot
>> drivers/base/memory.c:874:9: sparse: got void [noderef] <asn:4> **

vim +874 drivers/base/memory.c

845
846 /**
847 * walk_memory_blocks - walk through all present memory blocks overlapped
848 * by the range [start, start + size)
849 *
850 * @start: start address of the memory range
851 * @size: size of the memory range
852 * @arg: argument passed to func
853 * @func: callback for each memory section walked
854 *
855 * This function walks through all present memory blocks overlapped by the
856 * range [start, start + size), calling func on each memory block.
857 *
858 * In case func() returns an error, walking is aborted and the error is
859 * returned.
860 */
861 int walk_memory_blocks(unsigned long start, unsigned long size,
862 void *arg, walk_memory_blocks_func_t func)
863 {
864 struct radix_tree_iter iter;
865 const unsigned long start_block_id = phys_to_block_id(start);
866 const unsigned long end_block_id = phys_to_block_id(start + size - 1);
867 struct memory_block *mem;
868 void **slot;
869 int ret = 0;
870
871 if (!size)
872 return 0;
873
> 874 radix_tree_for_each_slot(slot, &memory_blocks, &iter, start_block_id) {
875 if (iter.index > end_block_id)
876 break;
> 877 mem = radix_tree_deref_slot(slot);

---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/hyperkitty/list/[email protected] Intel Corporation