Large memory systems (~1TB or more) experience boot delays on the order
of minutes, largely due to initializing the memory configuration part of
sysfs at /sys/devices/system/memory/.

ppc64 has a normal memory block size of 256M (though it can be as low as
16M depending on the system LMB size), and (I think) x86 is 128M. With
1TB of RAM and a 256M block size, that's 4k memory blocks; at 20 sysfs
entries per block, that's around 80k items that need to be created at
boot time in sysfs. Some systems go up to 16TB, where the issue is even
more severe.
This patch provides a means by which users can prevent the creation of
the memory block attributes at boot time, yet still dynamically create
them if they are needed.
This patch creates a new boot parameter, "largememory", that prevents
memory_dev_init() from creating all of the memory block sysfs attributes
at boot time. Instead, a new root attribute "show" allows the memory
block devices to be created dynamically on demand.

Another new root attribute "present" lists the memory blocks present in
the system, i.e. the valid inputs for the "show" attribute.
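For illustration, a userspace tool could drive the interface roughly like
this (the block id, buffer size, and exact list format below are only
assumptions for the example):

#include <stdio.h>

int main(void)
{
	char present[4096];
	FILE *f;

	/* discover which memory block ids exist, e.g. "0-31,64-95" */
	f = fopen("/sys/devices/system/memory/present", "r");
	if (!f)
		return 1;
	if (!fgets(present, sizeof(present), f)) {
		fclose(f);
		return 1;
	}
	fclose(f);
	printf("present blocks: %s", present);

	/* ask the kernel to create the sysfs entries for block 42 */
	f = fopen("/sys/devices/system/memory/show", "w");
	if (!f)
		return 1;
	fprintf(f, "42\n");
	fclose(f);

	/* /sys/devices/system/memory/memory42/ can now be used as usual */
	return 0;
}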
There was a significant amount of refactoring to allow for this, but
IMHO the code is much easier to understand now.
Signed-off-by: Seth Jennings <[email protected]>
---
Reviewer/Maintainer Notes:
This is a replacement for my previous RFC and extends the existing memory
sysfs API rather than introducing an alternate layout.
drivers/base/memory.c | 248 +++++++++++++++++++++++++++++++++++++------------
include/linux/memory.h | 1 -
2 files changed, 188 insertions(+), 61 deletions(-)
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 2b7813e..392ccd3 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -30,7 +30,7 @@ static DEFINE_MUTEX(mem_sysfs_mutex);
#define MEMORY_CLASS_NAME "memory"
-static int sections_per_block;
+static int sections_per_block __read_mostly;
static inline int base_memory_block_id(int section_nr)
{
@@ -47,6 +47,9 @@ static struct bus_type memory_subsys = {
.offline = memory_subsys_offline,
};
+static unsigned long *memblock_present;
+static bool largememory_enable __read_mostly;
+
static BLOCKING_NOTIFIER_HEAD(memory_chain);
int register_memory_notifier(struct notifier_block *nb)
@@ -565,16 +568,13 @@ static const struct attribute_group *memory_memblk_attr_groups[] = {
static
int register_memory(struct memory_block *memory)
{
- int error;
-
memory->dev.bus = &memory_subsys;
memory->dev.id = memory->start_section_nr / sections_per_block;
memory->dev.release = memory_block_release;
memory->dev.groups = memory_memblk_attr_groups;
memory->dev.offline = memory->state == MEM_OFFLINE;
- error = device_register(&memory->dev);
- return error;
+ return device_register(&memory->dev);
}
static int init_memory_block(struct memory_block **memory,
@@ -582,67 +582,72 @@ static int init_memory_block(struct memory_block **memory,
{
struct memory_block *mem;
unsigned long start_pfn;
- int scn_nr;
- int ret = 0;
+ int scn_nr, ret, memblock_id;
+ *memory = NULL;
mem = kzalloc(sizeof(*mem), GFP_KERNEL);
if (!mem)
return -ENOMEM;
scn_nr = __section_nr(section);
+ memblock_id = base_memory_block_id(scn_nr);
mem->start_section_nr =
base_memory_block_id(scn_nr) * sections_per_block;
mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
mem->state = state;
- mem->section_count++;
mutex_init(&mem->state_mutex);
start_pfn = section_nr_to_pfn(mem->start_section_nr);
mem->phys_device = arch_get_memory_phys_device(start_pfn);
ret = register_memory(mem);
+ if (ret) {
+ kfree(mem);
+ return ret;
+ }
*memory = mem;
- return ret;
+ return 0;
}
-static int add_memory_section(int nid, struct mem_section *section,
- struct memory_block **mem_p,
- unsigned long state, enum mem_add_context context)
+static int add_memory_block(int base_section_nr)
{
- struct memory_block *mem = NULL;
- int scn_nr = __section_nr(section);
- int ret = 0;
+ struct mem_section *section = __nr_to_section(base_section_nr);
+ struct memory_block *mem;
+ int i, ret = 0, present_sections = 0, memblock_id;
mutex_lock(&mem_sysfs_mutex);
- if (context == BOOT) {
- /* same memory block ? */
- if (mem_p && *mem_p)
- if (scn_nr >= (*mem_p)->start_section_nr &&
- scn_nr <= (*mem_p)->end_section_nr) {
- mem = *mem_p;
- kobject_get(&mem->dev.kobj);
- }
- } else
- mem = find_memory_block(section);
+ memblock_id = base_memory_block_id(base_section_nr);
+ if (WARN_ON_ONCE(!test_bit(memblock_id, memblock_present))) {
+ /* tried to add a non-present memory block, shouldn't happen */
+ ret = -EINVAL;
+ goto out;
+ }
- if (mem) {
- mem->section_count++;
- kobject_put(&mem->dev.kobj);
- } else {
- ret = init_memory_block(&mem, section, state);
- /* store memory_block pointer for next loop */
- if (!ret && context == BOOT)
- if (mem_p)
- *mem_p = mem;
+ /* count present sections */
+ for (i = base_section_nr;
+ (i < base_section_nr + sections_per_block) && i < NR_MEM_SECTIONS;
+ i++) {
+ if (present_section_nr(i))
+ present_sections++;
}
- if (!ret) {
- if (context == HOTPLUG &&
- mem->section_count == sections_per_block)
- ret = register_mem_sect_under_node(mem, nid);
+ if (WARN_ON_ONCE(present_sections == 0)) {
+ /*
+ * No present sections in memory block marked as present,
+ * shouldn't happen. If it does, correct the present bitfield
+ * and return error.
+ */
+ clear_bit(memblock_id, memblock_present);
+ ret = -EINVAL;
+ goto out;
}
+ ret = init_memory_block(&mem, section, MEM_ONLINE);
+ if (ret)
+ goto out;
+ mem->section_count = present_sections;
+out:
mutex_unlock(&mem_sysfs_mutex);
return ret;
}
@@ -653,7 +658,40 @@ static int add_memory_section(int nid, struct mem_section *section,
*/
int register_new_memory(int nid, struct mem_section *section)
{
- return add_memory_section(nid, section, NULL, MEM_OFFLINE, HOTPLUG);
+ int ret = 0, memblock_id;
+ struct memory_block *mem;
+
+ mutex_lock(&mem_sysfs_mutex);
+
+ memblock_id = base_memory_block_id(__section_nr(section));
+
+ /*
+ * Set present bit for the block if adding the first present
+ * section in the block.
+ */
+ if (!test_bit(memblock_id, memblock_present))
+ set_bit(memblock_id, memblock_present);
+
+ /* refs the memory_block dev if found */
+ mem = find_memory_block(section);
+
+ if (!mem) {
+ /* create offline memory block */
+ ret = init_memory_block(&mem, section, MEM_OFFLINE);
+ if (ret)
+ goto out;
+ kobject_get(&mem->dev.kobj);
+ }
+ mem->section_count++;
+ kobject_put(&mem->dev.kobj);
+
+ /* only register blocks with all sections present? */
+ if (mem->section_count == sections_per_block)
+ ret = register_mem_sect_under_node(mem, nid);
+
+out:
+ mutex_unlock(&mem_sysfs_mutex);
+ return ret;
}
#ifdef CONFIG_MEMORY_HOTREMOVE
@@ -671,15 +709,18 @@ static int remove_memory_block(unsigned long node_id,
struct mem_section *section, int phys_device)
{
struct memory_block *mem;
+ int memblock_id;
mutex_lock(&mem_sysfs_mutex);
mem = find_memory_block(section);
unregister_mem_sect_under_nodes(mem, __section_nr(section));
mem->section_count--;
- if (mem->section_count == 0)
+ if (mem->section_count == 0) {
unregister_memory(mem);
- else
+ memblock_id = base_memory_block_id(__section_nr(section));
+ clear_bit(memblock_id, memblock_present);
+ } else
kobject_put(&mem->dev.kobj);
mutex_unlock(&mem_sysfs_mutex);
@@ -701,6 +742,60 @@ bool is_memblock_offlined(struct memory_block *mem)
return mem->state == MEM_OFFLINE;
}
+static ssize_t memory_show_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long memblock_id;
+ int ret;
+
+ if (!largememory_enable)
+ /*
+ * If !largememory_enable then the memblock is sure to
+ * exist already because it was created at boot time by
+ * memory_dev_init()
+ */
+ return 0;
+
+ if (kstrtoul(buf, 10, &memblock_id))
+ return -EINVAL;
+
+ if (memblock_id >= base_memory_block_id(NR_MEM_SECTIONS))
+ return -EINVAL;
+
+ if (!test_bit(memblock_id, memblock_present))
+ return -EINVAL;
+
+ dev = subsys_find_device_by_id(&memory_subsys, memblock_id, NULL);
+ if (dev)
+ return 0;
+
+ ret = add_memory_block(memblock_id * sections_per_block);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+static DEVICE_ATTR(show, S_IWUSR, NULL, memory_show_store);
+
+static ssize_t memory_present_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int n_bits, ret;
+
+ n_bits = NR_MEM_SECTIONS / sections_per_block;
+ ret = bitmap_scnlistprintf(buf, PAGE_SIZE - 2,
+ memblock_present, n_bits);
+ buf[ret++] = '\n';
+ buf[ret] = '\0';
+
+ return ret;
+}
+
+static DEVICE_ATTR(present, S_IRUSR | S_IRGRP | S_IROTH,
+ memory_present_show, NULL);
+
static struct attribute *memory_root_attrs[] = {
#ifdef CONFIG_ARCH_MEMORY_PROBE
&dev_attr_probe.attr,
@@ -712,6 +807,8 @@ static struct attribute *memory_root_attrs[] = {
#endif
&dev_attr_block_size_bytes.attr,
+ &dev_attr_show.attr,
+ &dev_attr_present.attr,
NULL
};
@@ -724,16 +821,48 @@ static const struct attribute_group *memory_root_attr_groups[] = {
NULL,
};
+static int __init largememory_select(char *notused)
+{
+ largememory_enable = 1;
+ return 1;
+}
+__setup("largememory", largememory_select);
+
+static int __init init_memblock_present(int bitfield_size)
+{
+ int i, j;
+
+ /* allocate bitfield for monitoring present memory blocks */
+ memblock_present = kzalloc(bitfield_size, GFP_KERNEL);
+ if (!memblock_present)
+ return -ENOMEM;
+
+ /* for each block */
+ for (i = 0; i < NR_MEM_SECTIONS; i += sections_per_block) {
+ /* for each section in the block */
+ for (j = i; j < i + sections_per_block; j++) {
+ /*
+ * If at least one section is present in a block,
+ * then the block is considered present.
+ */
+ if (present_section_nr(j)) {
+ set_bit(base_memory_block_id(i),
+ memblock_present);
+ break;
+ }
+ }
+ }
+
+ return 0;
+}
+
/*
* Initialize the sysfs support for memory devices...
*/
int __init memory_dev_init(void)
{
- unsigned int i;
- int ret;
- int err;
+ int ret, nr_memblks, bitfield_size, memblock_id;
unsigned long block_sz;
- struct memory_block *mem = NULL;
ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
if (ret)
@@ -742,22 +871,21 @@ int __init memory_dev_init(void)
block_sz = get_memory_block_size();
sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
- /*
- * Create entries for memory sections that were found
- * during boot and have been initialized
- */
- for (i = 0; i < NR_MEM_SECTIONS; i++) {
- if (!present_section_nr(i))
- continue;
- /* don't need to reuse memory_block if only one per block */
- err = add_memory_section(0, __nr_to_section(i),
- (sections_per_block == 1) ? NULL : &mem,
- MEM_ONLINE,
- BOOT);
- if (!ret)
- ret = err;
- }
+ nr_memblks = NR_MEM_SECTIONS / sections_per_block;
+ bitfield_size = BITS_TO_LONGS(nr_memblks) * sizeof(unsigned long);
+ ret = init_memblock_present(bitfield_size);
+ if (ret)
+ goto out;
+
+ if (!largememory_enable) {
+ /*
+ * Create entries for memory sections that were found
+ * during boot and have been initialized
+ */
+ for_each_set_bit(memblock_id, memblock_present, nr_memblks)
+ add_memory_block(memblock_id * sections_per_block);
+ }
out:
if (ret)
printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 85c31a8..4c89fb0 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -125,7 +125,6 @@ extern struct memory_block *find_memory_block_hinted(struct mem_section *,
struct memory_block *);
extern struct memory_block *find_memory_block(struct mem_section *);
#define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT)
-enum mem_add_context { BOOT, HOTPLUG };
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
#ifdef CONFIG_MEMORY_HOTPLUG
--
1.8.3.4
On Wed, Aug 14, 2013 at 02:31:45PM -0500, Seth Jennings wrote:
> Large memory systems (~1TB or more) experience boot delays on the order
> of minutes due to the initializing the memory configuration part of
> sysfs at /sys/devices/system/memory/.
>
> ppc64 has a normal memory block size of 256M (however sometimes as low
> as 16M depending on the system LMB size), and (I think) x86 is 128M. With
> 1TB of RAM and a 256M block size, that's 4k memory blocks with 20 sysfs
> entries per block that's around 80k items that need be created at boot
> time in sysfs. Some systems go up to 16TB where the issue is even more
> severe.
>
> This patch provides a means by which users can prevent the creation of
> the memory block attributes at boot time, yet still dynamically create
> them if they are needed.
>
> This patch creates a new boot parameter, "largememory" that will prevent
> memory_dev_init() from creating all of the memory block sysfs attributes
> at boot time. Instead, a new root attribute "show" will allow
> the dynamic creation of the memory block devices.
> Another new root attribute "present" shows the memory blocks present in
> the system; the valid inputs for the "show" attribute.
Ick, no new boot parameters please, that's just a mess for distros and
users.
How about tying this into the work that has been happening on lkml with
booting large-memory systems faster? The work there should solve the
problems you are seeing here (i.e. add memory after booting). It looks
like this is the same issue you are having here, just in a different
part of the kernel.
thanks,
greg k-h
On Wed, Aug 14, 2013 at 02:31:45PM -0500, Seth Jennings wrote:
> Large memory systems (~1TB or more) experience boot delays on the order
> of minutes due to the initializing the memory configuration part of
> sysfs at /sys/devices/system/memory/.
Are you sure that is the problem area? Have you run perf on it?
> ppc64 has a normal memory block size of 256M (however sometimes as low
> as 16M depending on the system LMB size), and (I think) x86 is 128M. With
> 1TB of RAM and a 256M block size, that's 4k memory blocks with 20 sysfs
> entries per block that's around 80k items that need be created at boot
> time in sysfs. Some systems go up to 16TB where the issue is even more
> severe.
The x86 developers are working with larger memory sizes and they haven't
seen the problem in this area, for them it's in other places, as I
referred to in my other email.
> This patch provides a means by which users can prevent the creation of
> the memory block attributes at boot time, yet still dynamically create
> them if they are needed.
>
> This patch creates a new boot parameter, "largememory" that will prevent
> memory_dev_init() from creating all of the memory block sysfs attributes
> at boot time. Instead, a new root attribute "show" will allow
> the dynamic creation of the memory block devices.
> Another new root attribute "present" shows the memory blocks present in
> the system; the valid inputs for the "show" attribute.
You never documented any of these abi changes, which is a requirement
(not that I'm agreeing that a boot parameter is ok...)
> There was a significant amount of refactoring to allow for this but
> IMHO, the code is much easier to understand now.
Care to refactor things first, with no logical changes, and then make
your changes in a follow-on patch, so that people can actually find what
you changed in the patch?
Remember, a series of patches please, not one big "refactor and change
it all" patch.
thanks,
greg k-h
On 08/14/2013 12:43 PM, Greg Kroah-Hartman wrote:
> On Wed, Aug 14, 2013 at 02:31:45PM -0500, Seth Jennings wrote:
>> ppc64 has a normal memory block size of 256M (however sometimes as low
>> as 16M depending on the system LMB size), and (I think) x86 is 128M. With
>> 1TB of RAM and a 256M block size, that's 4k memory blocks with 20 sysfs
>> entries per block that's around 80k items that need be created at boot
>> time in sysfs. Some systems go up to 16TB where the issue is even more
>> severe.
>
> The x86 developers are working with larger memory sizes and they haven't
> seen the problem in this area, for them it's in other places, as I
> referred to in my other email.
The SGI guys don't run normal distro kernels and don't turn on memory
hotplug, so they don't see this. I do the same in my testing of
large-memory x86 systems to speed up my boots. I'll go stick it back in
there and see if I can generate some numbers for a 1TB machine.
But, the problem on x86 is at _worst_ 1/8 of the problem on ppc64, since
the SECTION_SIZE is 8x bigger by default.
Also, the cost of creating sections on ppc is *MUCH* higher than x86
when amortized across the number of pages that you're initializing. A
section on ppc64 has to be created for each (2^24/2^16)=256 pages while
one on x86 is created for each (2^27/2^12)=32768 pages.
Thus, x86 folks with our small pages and large sections tend to be
focused on per-page costs. The ppc folks with their small sections and
larger pages tend to be focused on the per-section costs.
On Wed, Aug 14, 2013 at 01:05:33PM -0700, Dave Hansen wrote:
> On 08/14/2013 12:43 PM, Greg Kroah-Hartman wrote:
> > On Wed, Aug 14, 2013 at 02:31:45PM -0500, Seth Jennings wrote:
> >> ppc64 has a normal memory block size of 256M (however sometimes as low
> >> as 16M depending on the system LMB size), and (I think) x86 is 128M. With
> >> 1TB of RAM and a 256M block size, that's 4k memory blocks with 20 sysfs
> >> entries per block that's around 80k items that need be created at boot
> >> time in sysfs. Some systems go up to 16TB where the issue is even more
> >> severe.
> >
> > The x86 developers are working with larger memory sizes and they haven't
> > seen the problem in this area, for them it's in other places, as I
> > referred to in my other email.
>
> The SGI guys don't run normal distro kernels and don't turn on memory
> hotplug, so they don't see this. I do the same in my testing of
> large-memory x86 systems to speed up my boots. I'll go stick it back in
> there and see if I can generate some numbers for a 1TB machine.
>
> But, the problem on x86 is at _worst_ 1/8 of the problem on ppc64 since
> the SECTION_SIZE is so 8x bigger by default.
>
> Also, the cost of creating sections on ppc is *MUCH* higher than x86
> when amortized across the number of pages that you're initializing. A
> section on ppc64 has to be created for each (2^24/2^16)=256 pages while
> one on x86 is created for each (2^27/2^12)=32768 pages.
>
> Thus, x86 folks with our small pages and large sections tend to be
> focused on per-page costs. The ppc folks with their small sections and
> larger pages tend to be focused on the per-section costs.
Ah, thanks for the explaination, now it makes more sense why they are
both optimizing in different places.
But a "cleanup" patch first, and then the "change the logic to go
faster" would be better here, so that we can review what is really
happening.
thanks,
greg k-h
On 08/14/2013 02:31 PM, Seth Jennings wrote:
> Large memory systems (~1TB or more) experience boot delays on the order
> of minutes due to the initializing the memory configuration part of
> sysfs at /sys/devices/system/memory/.
With the previous work that has been done in the memory sysfs layout, I
think you need machines with 8 or 16+ TB of memory to see boot delays
that are measured in minutes. The boot delay is there, though, and with
larger memory systems in the future it will only get worse.
>
> ppc64 has a normal memory block size of 256M (however sometimes as low
> as 16M depending on the system LMB size), and (I think) x86 is 128M. With
> 1TB of RAM and a 256M block size, that's 4k memory blocks with 20 sysfs
> entries per block that's around 80k items that need be created at boot
> time in sysfs. Some systems go up to 16TB where the issue is even more
> severe.
>
It should also be pointed out that the number of sysfs entries created on
a 16+ TB system is 100k+. At this scale it is not really human readable to
list all of the entries. The amount of resources used to create all of the
underlying structures for each of the entries starts to add up as well.
I think an approach such as this makes the sysfs memory layout more
human readable and saves on resources.
-Nathan
On 08/14/2013 12:31 PM, Seth Jennings wrote:
> There was a significant amount of refactoring to allow for this but
> IMHO, the code is much easier to understand now.
...
> drivers/base/memory.c | 248 +++++++++++++++++++++++++++++++++++++------------
> include/linux/memory.h | 1 -
> 2 files changed, 188 insertions(+), 61 deletions(-)
Adding 120 lines of code made it easier to understand? ;)
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index 2b7813e..392ccd3 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -30,7 +30,7 @@ static DEFINE_MUTEX(mem_sysfs_mutex);
>
> #define MEMORY_CLASS_NAME "memory"
>
> -static int sections_per_block;
> +static int sections_per_block __read_mostly;
>
> static inline int base_memory_block_id(int section_nr)
> {
> @@ -47,6 +47,9 @@ static struct bus_type memory_subsys = {
> .offline = memory_subsys_offline,
> };
>
> +static unsigned long *memblock_present;
> +static bool largememory_enable __read_mostly;
How would you see this getting used in practice? Are you just going to
set this by default on ppc? Or, would you ask the distros to put it on
the command-line by default? Would it only affect machines larger than
a certain size?
This approach breaks the ABI, right? An existing tool would not work
with this patch (plus boot option) since it would not know how to
show/hide things. It lets _part_ of those existing tools get reused
since they only have to be taught how to show/hide things.
I'd find this really intriguing if you found a way to keep even the old
tools working. Instead of having an explicit show/hide, why couldn't
you just create the entries on open(), for instance?
> int register_memory_notifier(struct notifier_block *nb)
> @@ -565,16 +568,13 @@ static const struct attribute_group *memory_memblk_attr_groups[] = {
> static
> int register_memory(struct memory_block *memory)
> {
> - int error;
> -
> memory->dev.bus = &memory_subsys;
> memory->dev.id = memory->start_section_nr / sections_per_block;
> memory->dev.release = memory_block_release;
> memory->dev.groups = memory_memblk_attr_groups;
> memory->dev.offline = memory->state == MEM_OFFLINE;
>
> - error = device_register(&memory->dev);
> - return error;
> + return device_register(&memory->dev);
> }
This kind of simplification could surely stand in its own patch.
> static int init_memory_block(struct memory_block **memory,
> @@ -582,67 +582,72 @@ static int init_memory_block(struct memory_block **memory,
> {
> struct memory_block *mem;
> unsigned long start_pfn;
> - int scn_nr;
> - int ret = 0;
> + int scn_nr, ret, memblock_id;
>
> + *memory = NULL;
> mem = kzalloc(sizeof(*mem), GFP_KERNEL);
> if (!mem)
> return -ENOMEM;
>
> scn_nr = __section_nr(section);
> + memblock_id = base_memory_block_id(scn_nr);
> mem->start_section_nr =
> base_memory_block_id(scn_nr) * sections_per_block;
> mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
> mem->state = state;
> - mem->section_count++;
> mutex_init(&mem->state_mutex);
> start_pfn = section_nr_to_pfn(mem->start_section_nr);
> mem->phys_device = arch_get_memory_phys_device(start_pfn);
>
> ret = register_memory(mem);
> + if (ret) {
> + kfree(mem);
> + return ret;
> + }
>
> *memory = mem;
> - return ret;
> + return 0;
> }
memblock_id doesn't appear to ever get used. This also appears to
change the conventions about where the 'memory_block' is allocated and
freed. It isn't immediately clear why it needed to be changed.
Looking at the rest, this _really_ needs to get refactored before it's
reviewable.
> +static ssize_t memory_present_show(struct device *dev,
> + struct device_attribute *attr, char *buf)
> +{
> + int n_bits, ret;
> +
> + n_bits = NR_MEM_SECTIONS / sections_per_block;
> + ret = bitmap_scnlistprintf(buf, PAGE_SIZE - 2,
> + memblock_present, n_bits);
> + buf[ret++] = '\n';
> + buf[ret] = '\0';
> +
> + return ret;
> +}
Doesn't this break the one-value-per-file rule?
On Wed, Aug 14, 2013 at 01:47:27PM -0700, Dave Hansen wrote:
> On 08/14/2013 12:31 PM, Seth Jennings wrote:
> > There was a significant amount of refactoring to allow for this but
> > IMHO, the code is much easier to understand now.
> ...
> > drivers/base/memory.c | 248 +++++++++++++++++++++++++++++++++++++------------
> > include/linux/memory.h | 1 -
> > 2 files changed, 188 insertions(+), 61 deletions(-)
>
> Adding 120 lines of code made it easier to understand? ;)
Currently, the memory block abstraction is bolted onto the section
layout pretty loosely. The add_memory_section() function is
particularly bad: the memory block is passed from call to call from
memory_dev_init(), and there is a lot of logic around whether the memory
block already exists and whether we are adding it at boot or as the
result of hotplug.

Also, because we were releasing the mem_sysfs_mutex after each section
add, we were having to do a get/put on the dev.kobj every time. That
isn't even done properly right now, as the memory block is not looked up
under the lock each time a ref is taken. This hasn't been a problem since
it is only done during boot, when memory blocks can't be concurrently
unregistered. But still, it speaks to the complexity.

This patch introduces add_memory_block(), which helps with the
delineation of sections and blocks and makes the code easier to
understand IMHO.
>
> > diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> > index 2b7813e..392ccd3 100644
> > --- a/drivers/base/memory.c
> > +++ b/drivers/base/memory.c
> > @@ -30,7 +30,7 @@ static DEFINE_MUTEX(mem_sysfs_mutex);
> >
> > #define MEMORY_CLASS_NAME "memory"
> >
> > -static int sections_per_block;
> > +static int sections_per_block __read_mostly;
> >
> > static inline int base_memory_block_id(int section_nr)
> > {
> > @@ -47,6 +47,9 @@ static struct bus_type memory_subsys = {
> > .offline = memory_subsys_offline,
> > };
> >
> > +static unsigned long *memblock_present;
> > +static bool largememory_enable __read_mostly;
>
> How would you see this getting used in practice? Are you just going to
> set this by default on ppc? Or, would you ask the distros to put it on
> the command-line by default? Would it only affect machines larger than
> a certain size?
It would not be on by default, but for people running into the problem
on their large memory machines, we could enable this after verifying
that any tools that operate on the memory block configs are "dynamic
memory block aware"
>
> This approach breaks the ABI, right?.
ABI... API... it does modify an expectation of current userspace tools.
> An existing tool would not work
> with this patch (plus boot option) since it would not know how to
> show/hide things. It lets _part_ of those existing tools get reused
> since they only have to be taught how to show/hide things.
>
> I'd find this really intriguing if you found a way to keep even the old
> tools working. Instead of having an explicit show/hide, why couldn't
> you just create the entries on open(), for instance?
Nathan and I talked about this and I'm not sure if sysfs would support
such a thing, i.e. memory block creation when someone tried to cd into
the memory block device config. I wouldn't know where to start on that.
>
> > int register_memory_notifier(struct notifier_block *nb)
> > @@ -565,16 +568,13 @@ static const struct attribute_group *memory_memblk_attr_groups[] = {
> > static
> > int register_memory(struct memory_block *memory)
> > {
> > - int error;
> > -
> > memory->dev.bus = &memory_subsys;
> > memory->dev.id = memory->start_section_nr / sections_per_block;
> > memory->dev.release = memory_block_release;
> > memory->dev.groups = memory_memblk_attr_groups;
> > memory->dev.offline = memory->state == MEM_OFFLINE;
> >
> > - error = device_register(&memory->dev);
> > - return error;
> > + return device_register(&memory->dev);
> > }
>
> This kind of simplification could surely stand in its own patch.
Yes.
>
> > static int init_memory_block(struct memory_block **memory,
> > @@ -582,67 +582,72 @@ static int init_memory_block(struct memory_block **memory,
> > {
> > struct memory_block *mem;
> > unsigned long start_pfn;
> > - int scn_nr;
> > - int ret = 0;
> > + int scn_nr, ret, memblock_id;
> >
> > + *memory = NULL;
> > mem = kzalloc(sizeof(*mem), GFP_KERNEL);
> > if (!mem)
> > return -ENOMEM;
> >
> > scn_nr = __section_nr(section);
> > + memblock_id = base_memory_block_id(scn_nr);
> > mem->start_section_nr =
> > base_memory_block_id(scn_nr) * sections_per_block;
> > mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
> > mem->state = state;
> > - mem->section_count++;
> > mutex_init(&mem->state_mutex);
> > start_pfn = section_nr_to_pfn(mem->start_section_nr);
> > mem->phys_device = arch_get_memory_phys_device(start_pfn);
> >
> > ret = register_memory(mem);
> > + if (ret) {
> > + kfree(mem);
> > + return ret;
> > + }
> >
> > *memory = mem;
> > - return ret;
> > + return 0;
> > }
>
> memblock_id doesn't appear to ever get used. This also appears to
> change the conventions about where the 'memory_block' is allocated and
> freed. It isn't immediately clear why it needed to be changed.
>
> Looking at the rest, this _really_ needs to get refactored before it's
> reviewable.
Yes, this doesn't review well in diff format, unfortunately. I guess
I'll need to break this down into steps that rewrite the same code
blocks from step to step.
>
> > +static ssize_t memory_present_show(struct device *dev,
> > + struct device_attribute *attr, char *buf)
> > +{
> > + int n_bits, ret;
> > +
> > + n_bits = NR_MEM_SECTIONS / sections_per_block;
> > + ret = bitmap_scnlistprintf(buf, PAGE_SIZE - 2,
> > + memblock_present, n_bits);
> > + buf[ret++] = '\n';
> > + buf[ret] = '\0';
> > +
> > + return ret;
> > +}
>
> Doesn't this break the one-value-per-file rule?
I didn't know there was such a rule, but it might. Is there any
acceptable way to express a range of values? I would just do a
"last_memblock_id", but the range can have holes.
Seth
On Wed, Aug 14, 2013 at 01:35:46PM -0700, Greg Kroah-Hartman wrote:
> On Wed, Aug 14, 2013 at 01:05:33PM -0700, Dave Hansen wrote:
> > On 08/14/2013 12:43 PM, Greg Kroah-Hartman wrote:
> > > On Wed, Aug 14, 2013 at 02:31:45PM -0500, Seth Jennings wrote:
> > >> ppc64 has a normal memory block size of 256M (however sometimes as low
> > >> as 16M depending on the system LMB size), and (I think) x86 is 128M. With
> > >> 1TB of RAM and a 256M block size, that's 4k memory blocks with 20 sysfs
> > >> entries per block that's around 80k items that need be created at boot
> > >> time in sysfs. Some systems go up to 16TB where the issue is even more
> > >> severe.
> > >
> > > The x86 developers are working with larger memory sizes and they haven't
> > > seen the problem in this area, for them it's in other places, as I
> > > referred to in my other email.
> >
> > The SGI guys don't run normal distro kernels and don't turn on memory
> > hotplug, so they don't see this. I do the same in my testing of
> > large-memory x86 systems to speed up my boots. I'll go stick it back in
> > there and see if I can generate some numbers for a 1TB machine.
> >
> > But, the problem on x86 is at _worst_ 1/8 of the problem on ppc64 since
> > the SECTION_SIZE is so 8x bigger by default.
> >
> > Also, the cost of creating sections on ppc is *MUCH* higher than x86
> > when amortized across the number of pages that you're initializing. A
> > section on ppc64 has to be created for each (2^24/2^16)=256 pages while
> > one on x86 is created for each (2^27/2^12)=32768 pages.
> >
> > Thus, x86 folks with our small pages and large sections tend to be
> > focused on per-page costs. The ppc folks with their small sections and
> > larger pages tend to be focused on the per-section costs.
>
> Ah, thanks for the explaination, now it makes more sense why they are
> both optimizing in different places.
Yes, thanks Dave for explaining that for me :)
>
> But a "cleanup" patch first, and then the "change the logic to go
> faster" would be better here, so that we can review what is really
> happening.
Will do.
>
> thanks,
>
> greg k-h
>
On 08/14/2013 02:14 PM, Seth Jennings wrote:
> On Wed, Aug 14, 2013 at 01:47:27PM -0700, Dave Hansen wrote:
>> On 08/14/2013 12:31 PM, Seth Jennings wrote:
>>> +static unsigned long *memblock_present;
>>> +static bool largememory_enable __read_mostly;
>>
>> How would you see this getting used in practice? Are you just going to
>> set this by default on ppc? Or, would you ask the distros to put it on
>> the command-line by default? Would it only affect machines larger than
>> a certain size?
>
> It would not be on by default, but for people running into the problem
> on their large memory machines, we could enable this after verifying
> that any tools that operate on the memory block configs are "dynamic
> memory block aware"
I don't have any idea how you would do this in practice. You can
obviously fix the dlpar tools that you're shipping for a given distro.
But, what about the other applications? I could imagine things like
databases wanting to know when memory comes and goes.
>> An existing tool would not work
>> with this patch (plus boot option) since it would not know how to
>> show/hide things. It lets _part_ of those existing tools get reused
>> since they only have to be taught how to show/hide things.
>>
>> I'd find this really intriguing if you found a way to keep even the old
>> tools working. Instead of having an explicit show/hide, why couldn't
>> you just create the entries on open(), for instance?
>
> Nathan and I talked about this and I'm not sure if sysfs would support
> such a thing, i.e. memory block creation when someone tried to cd into
> the memory block device config. I wouldn't know where to start on that.
It's not that fundamentally hard. Think of how an on-disk filesystem
works today. You do an open('foo') and the fs goes off and tries to
figure out whether there's something named 'foo' on the disk. If there
is, it creates inodes and dentries to back it. In your case, instead of
going to the disk, you go look at the memory configuration.
This might require a new filesystem instead of sysfs itself, but it
would potentially be a way to have good backward compatibility.
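Roughly, and only as a sketch (memoryfs_parse_block_id() and
memoryfs_make_block_inode() are made-up helpers, not existing code), the
lookup side of such a filesystem could be:

static struct dentry *memoryfs_lookup(struct inode *dir,
				      struct dentry *dentry,
				      unsigned int flags)
{
	struct inode *inode = NULL;
	int block_id;

	/* "memoryNNN" -> NNN, or -1 if the name doesn't look like a block */
	block_id = memoryfs_parse_block_id(dentry->d_name.name);

	/* consult the memory configuration instead of hitting a disk */
	if (block_id >= 0 && test_bit(block_id, memblock_present))
		inode = memoryfs_make_block_inode(dir->i_sb, block_id);

	/* a NULL inode gives a negative dentry, so bogus names get ENOENT */
	d_add(dentry, inode);
	return NULL;
}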
>>> +static ssize_t memory_present_show(struct device *dev,
>>> + struct device_attribute *attr, char *buf)
>>> +{
>>> + int n_bits, ret;
>>> +
>>> + n_bits = NR_MEM_SECTIONS / sections_per_block;
>>> + ret = bitmap_scnlistprintf(buf, PAGE_SIZE - 2,
>>> + memblock_present, n_bits);
>>> + buf[ret++] = '\n';
>>> + buf[ret] = '\0';
>>> +
>>> + return ret;
>>> +}
>>
>> Doesn't this break the one-value-per-file rule?
>
> I didn't know there was such a rule but it might. Is there any
> acceptable way to express a ranges of values. I would just do a
> "last_memblock_id" but the range can have holes.
The rules are written down very nicely:
Documentation/filesystems/sysfs.txt
I'm wrong, btw.... It's acceptable to do 'arrays' of values too, not
just single ones.
On 08/14/2013 02:14 PM, Seth Jennings wrote:
>> >An existing tool would not work
>> >with this patch (plus boot option) since it would not know how to
>> >show/hide things. It lets_part_ of those existing tools get reused
>> >since they only have to be taught how to show/hide things.
>> >
>> >I'd find this really intriguing if you found a way to keep even the old
>> >tools working. Instead of having an explicit show/hide, why couldn't
>> >you just create the entries on open(), for instance?
> Nathan and I talked about this and I'm not sure if sysfs would support
> such a thing, i.e. memory block creation when someone tried to cd into
> the memory block device config. I wouldn't know where to start on that.
>
Also, I'd expect userspace tools might use readdir() to find out what
memory blocks a system has (unless they just stat("memory0"),
stat("memory1")...). I don't think filesystem tricks (at least within
sysfs) are going to let this magically be solved without breaking the
userspace API.
On Wed, Aug 14, 2013 at 1:35 PM, Greg Kroah-Hartman
<[email protected]> wrote:
> On Wed, Aug 14, 2013 at 01:05:33PM -0700, Dave Hansen wrote:
>> On 08/14/2013 12:43 PM, Greg Kroah-Hartman wrote:
>> > On Wed, Aug 14, 2013 at 02:31:45PM -0500, Seth Jennings wrote:
>> >> ppc64 has a normal memory block size of 256M (however sometimes as low
>> >> as 16M depending on the system LMB size), and (I think) x86 is 128M. With
>> >> 1TB of RAM and a 256M block size, that's 4k memory blocks with 20 sysfs
>> >> entries per block that's around 80k items that need be created at boot
>> >> time in sysfs. Some systems go up to 16TB where the issue is even more
>> >> severe.
>> >
>> > The x86 developers are working with larger memory sizes and they haven't
>> > seen the problem in this area, for them it's in other places, as I
>> > referred to in my other email.
>>
>> The SGI guys don't run normal distro kernels and don't turn on memory
>> hotplug, so they don't see this. I do the same in my testing of
>> large-memory x86 systems to speed up my boots. I'll go stick it back in
>> there and see if I can generate some numbers for a 1TB machine.
>>
>> But, the problem on x86 is at _worst_ 1/8 of the problem on ppc64 since
>> the SECTION_SIZE is so 8x bigger by default.
>>
>> Also, the cost of creating sections on ppc is *MUCH* higher than x86
>> when amortized across the number of pages that you're initializing. A
>> section on ppc64 has to be created for each (2^24/2^16)=256 pages while
>> one on x86 is created for each (2^27/2^12)=32768 pages.
>>
>> Thus, x86 folks with our small pages and large sections tend to be
>> focused on per-page costs. The ppc folks with their small sections and
>> larger pages tend to be focused on the per-section costs.
>
> Ah, thanks for the explaination, now it makes more sense why they are
> both optimizing in different places.
I had one local patch that I sent before; it probes the block size for
generic x86_64. Setting it to 2G looks more reasonable for systems with
1T+ of RAM.

Also, can we add block_size in /sys directly, so we could generate fewer
entries?
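Something along these lines, as an untested sketch (the function name and
the 1T cutoff are just placeholders here):

static unsigned long probe_memory_block_size(void)
{
	/* keep the default (MIN_MEMORY_BLOCK_SIZE, 128M on x86_64)... */
	unsigned long bz = MIN_MEMORY_BLOCK_SIZE;

	/* ...but use 2G blocks on 1T+ systems so /sys has far fewer entries */
	if (max_pfn >= (1UL << (40 - PAGE_SHIFT)))
		bz = 2UL << 30;

	return bz;
}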
Thanks
Yinghai
On 08/14/2013 02:37 PM, Cody P Schafer wrote:
> Also, I'd expect userspace tools might use readdir() to find out what
> memory blocks a system has (unless they just stat("memory0"),
> stat("memory1")...). I don't think filesystem tricks (at least within
> sysfs) are going to let this magically be solved without breaking the
> userspace API.
sysfs files are probably a bit too tied to kobjects to make this work
easily in practice. It would probably need to be a new filesystem, imnho.
But, there's nothing to keep you from creating dentries for all of the
memory blocks if someone _does_ a readdir(). It'll suck, of course, but
it's at least compatible with what's there. You could also 'chmod -x'
it to make it more obvious that folks shouldn't be poking around in
there, although it won't keep them from ls'ing. If you're concerned
about resource consumption, we could also just make the directory
unreadable to everyone but root.
On Wed, Aug 14, 2013 at 02:37:26PM -0700, Yinghai Lu wrote:
> On Wed, Aug 14, 2013 at 1:35 PM, Greg Kroah-Hartman
> <[email protected]> wrote:
> > On Wed, Aug 14, 2013 at 01:05:33PM -0700, Dave Hansen wrote:
> >> On 08/14/2013 12:43 PM, Greg Kroah-Hartman wrote:
> >> > On Wed, Aug 14, 2013 at 02:31:45PM -0500, Seth Jennings wrote:
> >> >> ppc64 has a normal memory block size of 256M (however sometimes as low
> >> >> as 16M depending on the system LMB size), and (I think) x86 is 128M. With
> >> >> 1TB of RAM and a 256M block size, that's 4k memory blocks with 20 sysfs
> >> >> entries per block that's around 80k items that need be created at boot
> >> >> time in sysfs. Some systems go up to 16TB where the issue is even more
> >> >> severe.
> >> >
> >> > The x86 developers are working with larger memory sizes and they haven't
> >> > seen the problem in this area, for them it's in other places, as I
> >> > referred to in my other email.
> >>
> >> The SGI guys don't run normal distro kernels and don't turn on memory
> >> hotplug, so they don't see this. I do the same in my testing of
> >> large-memory x86 systems to speed up my boots. I'll go stick it back in
> >> there and see if I can generate some numbers for a 1TB machine.
> >>
> >> But, the problem on x86 is at _worst_ 1/8 of the problem on ppc64 since
> >> the SECTION_SIZE is so 8x bigger by default.
> >>
> >> Also, the cost of creating sections on ppc is *MUCH* higher than x86
> >> when amortized across the number of pages that you're initializing. A
> >> section on ppc64 has to be created for each (2^24/2^16)=256 pages while
> >> one on x86 is created for each (2^27/2^12)=32768 pages.
> >>
> >> Thus, x86 folks with our small pages and large sections tend to be
> >> focused on per-page costs. The ppc folks with their small sections and
> >> larger pages tend to be focused on the per-section costs.
> >
> > Ah, thanks for the explaination, now it makes more sense why they are
> > both optimizing in different places.
>
> I had one local patch that sent before, it will probe block size for
> generic x86_64.
> set it to 2G looks more reasonable for system with 1T+ ram.
If I am understanding you correctly, you are suggesting we make the block size
a boot time tunable. It can't be a runtime tunable since the memory blocks are
currently created at boot time.

On ppc64, we can't just choose a memory block size since it must align
with the underlying LMB (logical memory block) size, set in the hardware ahead
of time.
Seth
On Wed, Aug 14, 2013 at 2:52 PM, Seth Jennings
<[email protected]> wrote:
> On Wed, Aug 14, 2013 at 02:37:26PM -0700, Yinghai Lu wrote:
> If I am understanding you correctly, you are suggesting we make the block size
> a boot time tunable. It can't be a runtime tunable since the memory blocks are
> currently created a boot time.
Yes. If we could make it tunable at run time, that would be even better.
>
> On ppc64, we can't just just choose a memory block size since it must align
> with the underlying LMB (logical memory block) size, set in the hardware ahead
> of time.
Assume for x86_64, which now supports 46-bit physical addresses: if we
change to 2G, then a big system will only need to create (1<<15), aka 32k,
entries in /sys at most.
Thanks
Yinghai
On Wednesday, August 14, 2013 02:31:45 PM Seth Jennings wrote:
> Large memory systems (~1TB or more) experience boot delays on the order
> of minutes due to the initializing the memory configuration part of
> sysfs at /sys/devices/system/memory/.
>
> ppc64 has a normal memory block size of 256M (however sometimes as low
> as 16M depending on the system LMB size), and (I think) x86 is 128M. With
> 1TB of RAM and a 256M block size, that's 4k memory blocks with 20 sysfs
> entries per block that's around 80k items that need be created at boot
> time in sysfs. Some systems go up to 16TB where the issue is even more
> severe.
>
> This patch provides a means by which users can prevent the creation of
> the memory block attributes at boot time, yet still dynamically create
> them if they are needed.
>
> This patch creates a new boot parameter, "largememory" that will prevent
> memory_dev_init() from creating all of the memory block sysfs attributes
> at boot time. Instead, a new root attribute "show" will allow
> the dynamic creation of the memory block devices.
> Another new root attribute "present" shows the memory blocks present in
> the system; the valid inputs for the "show" attribute.
I wonder how this is going to work with the ACPI device object binding to
memory blocks that's in 3.11-rc.
That stuff will only work if the memory blocks are already there when
acpi_memory_enable_device() runs and that is called from the ACPI namespace
scanning code executed (1) during boot and (2) during hotplug. So I don't
think you can just create them on the fly at run time as a result of a
sysfs write.
Thanks,
Rafael
On Wed, Aug 14, 2013 at 04:52:53PM -0500, Seth Jennings wrote:
> On Wed, Aug 14, 2013 at 02:37:26PM -0700, Yinghai Lu wrote:
> > On Wed, Aug 14, 2013 at 1:35 PM, Greg Kroah-Hartman
> > <[email protected]> wrote:
> > > On Wed, Aug 14, 2013 at 01:05:33PM -0700, Dave Hansen wrote:
> > >> On 08/14/2013 12:43 PM, Greg Kroah-Hartman wrote:
> > >> > On Wed, Aug 14, 2013 at 02:31:45PM -0500, Seth Jennings wrote:
> > >> >> ppc64 has a normal memory block size of 256M (however sometimes as low
> > >> >> as 16M depending on the system LMB size), and (I think) x86 is 128M. With
> > >> >> 1TB of RAM and a 256M block size, that's 4k memory blocks with 20 sysfs
> > >> >> entries per block that's around 80k items that need be created at boot
> > >> >> time in sysfs. Some systems go up to 16TB where the issue is even more
> > >> >> severe.
> > >> >
> > >> > The x86 developers are working with larger memory sizes and they haven't
> > >> > seen the problem in this area, for them it's in other places, as I
> > >> > referred to in my other email.
> > >>
> > >> The SGI guys don't run normal distro kernels and don't turn on memory
> > >> hotplug, so they don't see this. I do the same in my testing of
> > >> large-memory x86 systems to speed up my boots. I'll go stick it back in
> > >> there and see if I can generate some numbers for a 1TB machine.
> > >>
> > >> But, the problem on x86 is at _worst_ 1/8 of the problem on ppc64 since
> > >> the SECTION_SIZE is so 8x bigger by default.
> > >>
> > >> Also, the cost of creating sections on ppc is *MUCH* higher than x86
> > >> when amortized across the number of pages that you're initializing. A
> > >> section on ppc64 has to be created for each (2^24/2^16)=256 pages while
> > >> one on x86 is created for each (2^27/2^12)=32768 pages.
> > >>
> > >> Thus, x86 folks with our small pages and large sections tend to be
> > >> focused on per-page costs. The ppc folks with their small sections and
> > >> larger pages tend to be focused on the per-section costs.
> > >
> > > Ah, thanks for the explaination, now it makes more sense why they are
> > > both optimizing in different places.
> >
> > I had one local patch that sent before, it will probe block size for
> > generic x86_64.
> > set it to 2G looks more reasonable for system with 1T+ ram.
>
> If I am understanding you correctly, you are suggesting we make the block size
> a boot time tunable. It can't be a runtime tunable since the memory blocks are
> currently created a boot time.
>
> On ppc64, we can't just just choose a memory block size since it must align
> with the underlying LMB (logical memory block) size, set in the hardware ahead
> of time.
As long as the Linux block size is a multiple of the LMB size it should
be possible. You'd just have to plug/unplug multiple LMBs at once.
It would be possible to construct an LMB layout that defeats that, eg.
every 2nd LMB not present, but I don't think that's actually a concern
in practice.
cheers
On Thu, Aug 15, 2013 at 02:01:09AM +0200, Rafael J. Wysocki wrote:
> On Wednesday, August 14, 2013 02:31:45 PM Seth Jennings wrote:
> > Large memory systems (~1TB or more) experience boot delays on the order
> > of minutes due to the initializing the memory configuration part of
> > sysfs at /sys/devices/system/memory/.
> >
> > ppc64 has a normal memory block size of 256M (however sometimes as low
> > as 16M depending on the system LMB size), and (I think) x86 is 128M. With
> > 1TB of RAM and a 256M block size, that's 4k memory blocks with 20 sysfs
> > entries per block that's around 80k items that need be created at boot
> > time in sysfs. Some systems go up to 16TB where the issue is even more
> > severe.
> >
> > This patch provides a means by which users can prevent the creation of
> > the memory block attributes at boot time, yet still dynamically create
> > them if they are needed.
> >
> > This patch creates a new boot parameter, "largememory" that will prevent
> > memory_dev_init() from creating all of the memory block sysfs attributes
> > at boot time. Instead, a new root attribute "show" will allow
> > the dynamic creation of the memory block devices.
> > Another new root attribute "present" shows the memory blocks present in
> > the system; the valid inputs for the "show" attribute.
>
> I wonder how this is going to work with the ACPI device object binding to
> memory blocks that's in 3.11-rc.
Thanks for pointing this out. Yes, the walking of the memory blocks in
this code will present a problem for the dynamic creation of memory
blocks :/ Sounds like there are some other challenges (backward
compatibility, no boot parameter) that I'll have to look at as well.
Seth
>
> That stuff will only work if the memory blocks are already there when
> acpi_memory_enable_device() runs and that is called from the ACPI namespace
> scanning code executed (1) during boot and (2) during hotplug. So I don't
> think you can just create them on the fly at run time as a result of a
> sysfs write.
>
> Thanks,
> Rafael
>
On Wed, Aug 14, 2013 at 12:40:43PM -0700, Greg Kroah-Hartman wrote:
> On Wed, Aug 14, 2013 at 02:31:45PM -0500, Seth Jennings wrote:
> > Large memory systems (~1TB or more) experience boot delays on the order
> > of minutes due to the initializing the memory configuration part of
> > sysfs at /sys/devices/system/memory/.
> >
> > ppc64 has a normal memory block size of 256M (however sometimes as low
> > as 16M depending on the system LMB size), and (I think) x86 is 128M. With
> > 1TB of RAM and a 256M block size, that's 4k memory blocks with 20 sysfs
> > entries per block that's around 80k items that need be created at boot
> > time in sysfs. Some systems go up to 16TB where the issue is even more
> > severe.
> >
> > This patch provides a means by which users can prevent the creation of
> > the memory block attributes at boot time, yet still dynamically create
> > them if they are needed.
> >
> > This patch creates a new boot parameter, "largememory" that will prevent
> > memory_dev_init() from creating all of the memory block sysfs attributes
> > at boot time. Instead, a new root attribute "show" will allow
> > the dynamic creation of the memory block devices.
> > Another new root attribute "present" shows the memory blocks present in
> > the system; the valid inputs for the "show" attribute.
>
> Ick, no new boot parameters please, that's just a mess for distros and
> users.
Yes, I agree it isn't the best. The reason for it is backward
compatibility; or rather, the user saying "I knowingly forfeit backward
compatibility in favor of fast boot time, and all my userspace tools are
aware of the new requirement to show memory blocks before trying to use
them".
The only suggestion I heard that would make full backward compatibility
possible is one from Dave to create a new filesystem for memory blocks
(not sysfs) where the memory block directories would be dynamically
created as programs tried to access/open them. But you'd still have the
issue of requiring user intervention to mount that "memoryfs" at
/sys/devices/system/memory (or whatever your sysfs mount point was).
So it's tricky.
>
> How about tying this into the work that has been happening on lkml with
> booting large-memory systems faster? The work there should solve the
> problems you are seeing here (i.e. add memory after booting). It looks
> like this is the same issue you are having here, just in a different
> part of the kernel.
I assume you are referring to the "[RFC v3 0/5] Transparent on-demand
struct page initialization embedded in the buddy allocator" thread. I
think that is trying to solve a different problem than I am trying to
solve, though. IIUC, that patch series is deferring the initialization of
the actual memory pages.
I'm working on breaking out just the refactoring patches (no functional
change) into a reviewable patch series. Thanks for your time looking at
this!
Seth