When hot-removing memory release_mem_region_adjustable() splits
iomem resources if they are not the exact size of the memory being
hot-deleted. Adding this memory back to the kernel adds a new
resource.
Eg a node has memory 0x0 - 0xfffffffff. Offlining and hot-removing
1GB from 0xf40000000 results in the single resource 0x0-0xfffffffff being
split into two resources: 0x0-0xf3fffffff and 0xf80000000-0xfffffffff.
When we hot-add the memory back we now have three resources:
0x0-0xf3fffffff, 0xf40000000-0xf7fffffff, and 0xf80000000-0xfffffffff.
Now if we try to remove a section of memory that overlaps these resources,
like 2GB from 0xf40000000, release_mem_region_adjustable() fails as it
expects the chunk of memory to be within the boundaries of a single
resource.
This patch adds a function request_resource_and_merge(). This is called
instead of request_resource_conflict() when registering a resource in
add_memory(). It calls request_resource_conflict() and if hot-removing is
enabled (if it isn't we won't get resource fragmentation) we attempt to
merge contiguous resources on the node.
Signed-off-by: Rashmica Gupta <[email protected]>
---
v1->v2: Only attempt to merge resources if hot-remove is enabled.
include/linux/ioport.h | 2 +
include/linux/memory_hotplug.h | 2 +-
kernel/resource.c | 116 +++++++++++++++++++++++++++++++++++++++++
mm/memory_hotplug.c | 22 ++++----
4 files changed, 130 insertions(+), 12 deletions(-)
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index da0ebaec25f0..f5b93a711e86 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -189,6 +189,8 @@ extern int allocate_resource(struct resource *root, struct resource *new,
resource_size_t,
resource_size_t),
void *alignf_data);
+extern struct resource *request_resource_and_merge(struct resource *parent,
+ struct resource *new, int nid);
struct resource *lookup_resource(struct resource *root, resource_size_t start);
int adjust_resource(struct resource *res, resource_size_t start,
resource_size_t size);
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 4e9828cda7a2..9c00f97c8cc6 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -322,7 +322,7 @@ static inline void remove_memory(int nid, u64 start, u64 size) {}
extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
void *arg, int (*func)(struct memory_block *, void *));
extern int add_memory(int nid, u64 start, u64 size);
-extern int add_memory_resource(int nid, struct resource *resource, bool online);
+extern int add_memory_resource(int nid, u64 start, u64 size, bool online);
extern int arch_add_memory(int nid, u64 start, u64 size,
struct vmem_altmap *altmap, bool want_memblock);
extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
diff --git a/kernel/resource.c b/kernel/resource.c
index 30e1bc68503b..5967061f9cea 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1621,3 +1621,119 @@ static int __init strict_iomem(char *str)
}
__setup("iomem=", strict_iomem);
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/*
+ * Attempt to merge resource and it's sibling
+ */
+static int merge_resources(struct resource *res)
+{
+ struct resource *next;
+ struct resource *tmp;
+ uint64_t size;
+ int ret = -EINVAL;
+
+ next = res->sibling;
+
+ /*
+ * Not sure how to handle two different children. So only attempt
+ * to merge two resources if neither have children, only one has a
+ * child or if both have the same child.
+ */
+ if ((res->child && next->child) && (res->child != next->child))
+ return ret;
+
+ if (res->end + 1 != next->start)
+ return ret;
+
+ if (res->flags != next->flags)
+ return ret;
+
+ /* Update sibling and child of resource */
+ res->sibling = next->sibling;
+ tmp = res->child;
+ if (!res->child)
+ res->child = next->child;
+
+ size = next->end - res->start + 1;
+ ret = __adjust_resource(res, res->start, size);
+ if (ret) {
+ /* Failed so restore resource to original state */
+ res->sibling = next;
+ res->child = tmp;
+ return ret;
+ }
+
+ free_resource(next);
+
+ return ret;
+}
+
+/*
+ * Attempt to merge resources on the node
+ */
+static void merge_node_resources(int nid, struct resource *parent)
+{
+ struct resource *res;
+ uint64_t start_addr;
+ uint64_t end_addr;
+ int ret;
+
+ start_addr = node_start_pfn(nid) << PAGE_SHIFT;
+ end_addr = node_end_pfn(nid) << PAGE_SHIFT;
+
+ write_lock(&resource_lock);
+
+ /* Get the first resource */
+ res = parent->child;
+
+ while (res) {
+ /* Check that the resource is within the node */
+ if (res->start < start_addr) {
+ res = res->sibling;
+ continue;
+ }
+ /* Exit if resource is past end of node */
+ if (res->sibling->end > end_addr)
+ break;
+
+ ret = merge_resources(res);
+ if (!ret)
+ continue;
+ res = res->sibling;
+ }
+ write_unlock(&resource_lock);
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+
+/**
+ * request_resource_and_merge() - request an I/O or memory resource for hot-add
+ * @parent: parent resource descriptor
+ * @new: resource descriptor desired by caller
+ * @nid: node id of the node we want the resource on
+ *
+ * Returns NULL for success and conflict resource on error.
+ * If no conflict resource then attempt to merge resources on the node.
+ *
+ * This is intended to cleanup the fragmentation of resources that occurs when
+ * hot-removing memory (see release_mem_region_adjustable). If hot-removing is
+ * not enabled then there is no point trying to merge resources.
+ */
+struct resource *request_resource_and_merge(struct resource *parent,
+ struct resource *new, int nid)
+{
+ struct resource *conflict;
+
+ conflict = request_resource_conflict(parent, new);
+
+ if (conflict)
+ return conflict;
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+ merge_node_resources(nid, parent);
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+
+ return NULL;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 7deb49f69e27..2e342f5ce322 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -97,7 +97,7 @@ void mem_hotplug_done(void)
}
/* add this memory to iomem resource */
-static struct resource *register_memory_resource(u64 start, u64 size)
+static struct resource *register_memory_resource(int nid, u64 start, u64 size)
{
struct resource *res, *conflict;
res = kzalloc(sizeof(struct resource), GFP_KERNEL);
@@ -108,7 +108,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
res->start = start;
res->end = start + size - 1;
res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
- conflict = request_resource_conflict(&iomem_resource, res);
+ conflict = request_resource_and_merge(&iomem_resource, res, nid);
if (conflict) {
if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
pr_debug("Device unaddressable memory block "
@@ -122,11 +122,15 @@ static struct resource *register_memory_resource(u64 start, u64 size)
return res;
}
-static void release_memory_resource(struct resource *res)
+static void release_memory_resource(struct resource *res, u64 start, u64 size)
{
if (!res)
return;
+#ifdef CONFIG_MEMORY_HOTREMOVE
+ release_mem_region_adjustable(&iomem_resource, start, size);
+#else
release_resource(res);
+#endif
kfree(res);
return;
}
@@ -1096,17 +1100,13 @@ static int online_memory_block(struct memory_block *mem, void *arg)
}
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-int __ref add_memory_resource(int nid, struct resource *res, bool online)
+int __ref add_memory_resource(int nid, u64 start, u64 size, bool online)
{
- u64 start, size;
pg_data_t *pgdat = NULL;
bool new_pgdat;
bool new_node;
int ret;
- start = res->start;
- size = resource_size(res);
-
ret = check_hotplug_memory_range(start, size);
if (ret)
return ret;
@@ -1195,13 +1195,13 @@ int __ref add_memory(int nid, u64 start, u64 size)
struct resource *res;
int ret;
- res = register_memory_resource(start, size);
+ res = register_memory_resource(nid, start, size);
if (IS_ERR(res))
return PTR_ERR(res);
- ret = add_memory_resource(nid, res, memhp_auto_online);
+ ret = add_memory_resource(nid, start, size, memhp_auto_online);
if (ret < 0)
- release_memory_resource(res);
+ release_memory_resource(res, start, size);
return ret;
}
EXPORT_SYMBOL_GPL(add_memory);
--
2.14.4
On Mon, Aug 06, 2018 at 04:52:24PM +1000, Rashmica Gupta wrote:
> When hot-removing memory release_mem_region_adjustable() splits
> iomem resources if they are not the exact size of the memory being
> hot-deleted. Adding this memory back to the kernel adds a new
> resource.
>
> Eg a node has memory 0x0 - 0xfffffffff. Offlining and hot-removing
> 1GB from 0xf40000000 results in the single resource 0x0-0xfffffffff being
> split into two resources: 0x0-0xf3fffffff and 0xf80000000-0xfffffffff.
>
> When we hot-add the memory back we now have three resources:
> 0x0-0xf3fffffff, 0xf40000000-0xf7fffffff, and 0xf80000000-0xfffffffff.
>
> Now if we try to remove a section of memory that overlaps these resources,
> like 2GB from 0xf40000000, release_mem_region_adjustable() fails as it
> expects the chunk of memory to be within the boundaries of a single
> resource.
>
> This patch adds a function request_resource_and_merge(). This is called
> instead of request_resource_conflict() when registering a resource in
> add_memory(). It calls request_resource_conflict() and if hot-removing is
> enabled (if it isn't we won't get resource fragmentation) we attempt to
> merge contiguous resources on the node.
>
> Signed-off-by: Rashmica Gupta <[email protected]>
> ---
> v1->v2: Only attempt to merge resources if hot-remove is enabled.
>
> include/linux/ioport.h | 2 +
> include/linux/memory_hotplug.h | 2 +-
> kernel/resource.c | 116 +++++++++++++++++++++++++++++++++++++++++
> mm/memory_hotplug.c | 22 ++++----
> 4 files changed, 130 insertions(+), 12 deletions(-)
>
> diff --git a/include/linux/ioport.h b/include/linux/ioport.h
> index da0ebaec25f0..f5b93a711e86 100644
> --- a/include/linux/ioport.h
> +++ b/include/linux/ioport.h
> @@ -189,6 +189,8 @@ extern int allocate_resource(struct resource *root, struct resource *new,
> resource_size_t,
> resource_size_t),
> void *alignf_data);
> +extern struct resource *request_resource_and_merge(struct resource *parent,
> + struct resource *new, int nid);
> struct resource *lookup_resource(struct resource *root, resource_size_t start);
> int adjust_resource(struct resource *res, resource_size_t start,
> resource_size_t size);
> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> index 4e9828cda7a2..9c00f97c8cc6 100644
> --- a/include/linux/memory_hotplug.h
> +++ b/include/linux/memory_hotplug.h
> @@ -322,7 +322,7 @@ static inline void remove_memory(int nid, u64 start, u64 size) {}
> extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
> void *arg, int (*func)(struct memory_block *, void *));
> extern int add_memory(int nid, u64 start, u64 size);
> -extern int add_memory_resource(int nid, struct resource *resource, bool online);
> +extern int add_memory_resource(int nid, u64 start, u64 size, bool online);
The add_memory_resource() function is also used by Xen balloon. It should
be updated as well.
> extern int arch_add_memory(int nid, u64 start, u64 size,
> struct vmem_altmap *altmap, bool want_memblock);
> extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
> diff --git a/kernel/resource.c b/kernel/resource.c
> index 30e1bc68503b..5967061f9cea 100644
> --- a/kernel/resource.c
> +++ b/kernel/resource.c
> @@ -1621,3 +1621,119 @@ static int __init strict_iomem(char *str)
> }
>
> __setup("iomem=", strict_iomem);
> +
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +#ifdef CONFIG_MEMORY_HOTREMOVE
> +/*
> + * Attempt to merge resource and it's sibling
> + */
> +static int merge_resources(struct resource *res)
> +{
> + struct resource *next;
> + struct resource *tmp;
> + uint64_t size;
> + int ret = -EINVAL;
> +
> + next = res->sibling;
> +
> + /*
> + * Not sure how to handle two different children. So only attempt
> + * to merge two resources if neither have children, only one has a
> + * child or if both have the same child.
> + */
> + if ((res->child && next->child) && (res->child != next->child))
> + return ret;
> +
> + if (res->end + 1 != next->start)
> + return ret;
> +
> + if (res->flags != next->flags)
> + return ret;
> +
> + /* Update sibling and child of resource */
> + res->sibling = next->sibling;
> + tmp = res->child;
> + if (!res->child)
> + res->child = next->child;
> +
> + size = next->end - res->start + 1;
> + ret = __adjust_resource(res, res->start, size);
> + if (ret) {
> + /* Failed so restore resource to original state */
> + res->sibling = next;
> + res->child = tmp;
> + return ret;
> + }
> +
> + free_resource(next);
> +
> + return ret;
> +}
> +
> +/*
> + * Attempt to merge resources on the node
> + */
> +static void merge_node_resources(int nid, struct resource *parent)
> +{
> + struct resource *res;
> + uint64_t start_addr;
> + uint64_t end_addr;
> + int ret;
> +
> + start_addr = node_start_pfn(nid) << PAGE_SHIFT;
> + end_addr = node_end_pfn(nid) << PAGE_SHIFT;
> +
> + write_lock(&resource_lock);
> +
> + /* Get the first resource */
> + res = parent->child;
> +
> + while (res) {
> + /* Check that the resource is within the node */
> + if (res->start < start_addr) {
> + res = res->sibling;
> + continue;
> + }
> + /* Exit if resource is past end of node */
> + if (res->sibling->end > end_addr)
> + break;
> +
> + ret = merge_resources(res);
> + if (!ret)
> + continue;
> + res = res->sibling;
> + }
> + write_unlock(&resource_lock);
> +}
> +#endif /* CONFIG_MEMORY_HOTREMOVE */
> +
> +/**
> + * request_resource_and_merge() - request an I/O or memory resource for hot-add
> + * @parent: parent resource descriptor
> + * @new: resource descriptor desired by caller
> + * @nid: node id of the node we want the resource on
> + *
> + * Returns NULL for success and conflict resource on error.
> + * If no conflict resource then attempt to merge resources on the node.
Please put the return value description at the end and start it with
'Return:' rather than 'Returns'
> + *
> + * This is intended to cleanup the fragmentation of resources that occurs when
> + * hot-removing memory (see release_mem_region_adjustable). If hot-removing is
> + * not enabled then there is no point trying to merge resources.
I think it's worth noting that inability to merge the resources is not an
error.
> + */
> +struct resource *request_resource_and_merge(struct resource *parent,
> + struct resource *new, int nid)
> +{
> + struct resource *conflict;
> +
> + conflict = request_resource_conflict(parent, new);
> +
> + if (conflict)
> + return conflict;
> +
> +#ifdef CONFIG_MEMORY_HOTREMOVE
> + merge_node_resources(nid, parent);
> +#endif /* CONFIG_MEMORY_HOTREMOVE */
> +
> + return NULL;
> +}
> +#endif /* CONFIG_MEMORY_HOTPLUG */
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 7deb49f69e27..2e342f5ce322 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -97,7 +97,7 @@ void mem_hotplug_done(void)
> }
>
> /* add this memory to iomem resource */
> -static struct resource *register_memory_resource(u64 start, u64 size)
> +static struct resource *register_memory_resource(int nid, u64 start, u64 size)
> {
> struct resource *res, *conflict;
> res = kzalloc(sizeof(struct resource), GFP_KERNEL);
> @@ -108,7 +108,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
> res->start = start;
> res->end = start + size - 1;
> res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
> - conflict = request_resource_conflict(&iomem_resource, res);
> + conflict = request_resource_and_merge(&iomem_resource, res, nid);
> if (conflict) {
> if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
> pr_debug("Device unaddressable memory block "
> @@ -122,11 +122,15 @@ static struct resource *register_memory_resource(u64 start, u64 size)
> return res;
> }
>
> -static void release_memory_resource(struct resource *res)
> +static void release_memory_resource(struct resource *res, u64 start, u64 size)
> {
> if (!res)
> return;
> +#ifdef CONFIG_MEMORY_HOTREMOVE
> + release_mem_region_adjustable(&iomem_resource, start, size);
> +#else
> release_resource(res);
> +#endif
> kfree(res);
> return;
> }
> @@ -1096,17 +1100,13 @@ static int online_memory_block(struct memory_block *mem, void *arg)
> }
>
> /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
> -int __ref add_memory_resource(int nid, struct resource *res, bool online)
> +int __ref add_memory_resource(int nid, u64 start, u64 size, bool online)
> {
> - u64 start, size;
> pg_data_t *pgdat = NULL;
> bool new_pgdat;
> bool new_node;
> int ret;
>
> - start = res->start;
> - size = resource_size(res);
> -
> ret = check_hotplug_memory_range(start, size);
> if (ret)
> return ret;
> @@ -1195,13 +1195,13 @@ int __ref add_memory(int nid, u64 start, u64 size)
> struct resource *res;
> int ret;
>
> - res = register_memory_resource(start, size);
> + res = register_memory_resource(nid, start, size);
> if (IS_ERR(res))
> return PTR_ERR(res);
>
> - ret = add_memory_resource(nid, res, memhp_auto_online);
> + ret = add_memory_resource(nid, start, size, memhp_auto_online);
> if (ret < 0)
> - release_memory_resource(res);
> + release_memory_resource(res, start, size);
> return ret;
> }
> EXPORT_SYMBOL_GPL(add_memory);
> --
> 2.14.4
>
--
Sincerely yours,
Mike.
On 07/08/18 00:14, Mike Rapoport wrote:
> On Mon, Aug 06, 2018 at 04:52:24PM +1000, Rashmica Gupta wrote:
>> When hot-removing memory release_mem_region_adjustable() splits
>> iomem resources if they are not the exact size of the memory being
>> hot-deleted. Adding this memory back to the kernel adds a new
>> resource.
>>
>> Eg a node has memory 0x0 - 0xfffffffff. Offlining and hot-removing
>> 1GB from 0xf40000000 results in the single resource 0x0-0xfffffffff being
>> split into two resources: 0x0-0xf3fffffff and 0xf80000000-0xfffffffff.
>>
>> When we hot-add the memory back we now have three resources:
>> 0x0-0xf3fffffff, 0xf40000000-0xf7fffffff, and 0xf80000000-0xfffffffff.
>>
>> Now if we try to remove a section of memory that overlaps these resources,
>> like 2GB from 0xf40000000, release_mem_region_adjustable() fails as it
>> expects the chunk of memory to be within the boundaries of a single
>> resource.
>>
>> This patch adds a function request_resource_and_merge(). This is called
>> instead of request_resource_conflict() when registering a resource in
>> add_memory(). It calls request_resource_conflict() and if hot-removing is
>> enabled (if it isn't we won't get resource fragmentation) we attempt to
>> merge contiguous resources on the node.
>>
>> Signed-off-by: Rashmica Gupta <[email protected]>
>> ---
>> v1->v2: Only attempt to merge resources if hot-remove is enabled.
>>
>> include/linux/ioport.h | 2 +
>> include/linux/memory_hotplug.h | 2 +-
>> kernel/resource.c | 116 +++++++++++++++++++++++++++++++++++++++++
>> mm/memory_hotplug.c | 22 ++++----
>> 4 files changed, 130 insertions(+), 12 deletions(-)
>>
>> diff --git a/include/linux/ioport.h b/include/linux/ioport.h
>> index da0ebaec25f0..f5b93a711e86 100644
>> --- a/include/linux/ioport.h
>> +++ b/include/linux/ioport.h
>> @@ -189,6 +189,8 @@ extern int allocate_resource(struct resource *root, struct resource *new,
>> resource_size_t,
>> resource_size_t),
>> void *alignf_data);
>> +extern struct resource *request_resource_and_merge(struct resource *parent,
>> + struct resource *new, int nid);
>> struct resource *lookup_resource(struct resource *root, resource_size_t start);
>> int adjust_resource(struct resource *res, resource_size_t start,
>> resource_size_t size);
>> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
>> index 4e9828cda7a2..9c00f97c8cc6 100644
>> --- a/include/linux/memory_hotplug.h
>> +++ b/include/linux/memory_hotplug.h
>> @@ -322,7 +322,7 @@ static inline void remove_memory(int nid, u64 start, u64 size) {}
>> extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
>> void *arg, int (*func)(struct memory_block *, void *));
>> extern int add_memory(int nid, u64 start, u64 size);
>> -extern int add_memory_resource(int nid, struct resource *resource, bool online);
>> +extern int add_memory_resource(int nid, u64 start, u64 size, bool online);
> The add_memory_resource() function is also used by Xen balloon. It should
> be updated as well.
Thanks for spotting that.
>> extern int arch_add_memory(int nid, u64 start, u64 size,
>> struct vmem_altmap *altmap, bool want_memblock);
>> extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
>> diff --git a/kernel/resource.c b/kernel/resource.c
>> index 30e1bc68503b..5967061f9cea 100644
>> --- a/kernel/resource.c
>> +++ b/kernel/resource.c
>> @@ -1621,3 +1621,119 @@ static int __init strict_iomem(char *str)
>> }
>>
>> __setup("iomem=", strict_iomem);
>> +
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>> +/*
>> + * Attempt to merge resource and it's sibling
>> + */
>> +static int merge_resources(struct resource *res)
>> +{
>> + struct resource *next;
>> + struct resource *tmp;
>> + uint64_t size;
>> + int ret = -EINVAL;
>> +
>> + next = res->sibling;
>> +
>> + /*
>> + * Not sure how to handle two different children. So only attempt
>> + * to merge two resources if neither have children, only one has a
>> + * child or if both have the same child.
>> + */
>> + if ((res->child && next->child) && (res->child != next->child))
>> + return ret;
>> +
>> + if (res->end + 1 != next->start)
>> + return ret;
>> +
>> + if (res->flags != next->flags)
>> + return ret;
>> +
>> + /* Update sibling and child of resource */
>> + res->sibling = next->sibling;
>> + tmp = res->child;
>> + if (!res->child)
>> + res->child = next->child;
>> +
>> + size = next->end - res->start + 1;
>> + ret = __adjust_resource(res, res->start, size);
>> + if (ret) {
>> + /* Failed so restore resource to original state */
>> + res->sibling = next;
>> + res->child = tmp;
>> + return ret;
>> + }
>> +
>> + free_resource(next);
>> +
>> + return ret;
>> +}
>> +
>> +/*
>> + * Attempt to merge resources on the node
>> + */
>> +static void merge_node_resources(int nid, struct resource *parent)
>> +{
>> + struct resource *res;
>> + uint64_t start_addr;
>> + uint64_t end_addr;
>> + int ret;
>> +
>> + start_addr = node_start_pfn(nid) << PAGE_SHIFT;
>> + end_addr = node_end_pfn(nid) << PAGE_SHIFT;
>> +
>> + write_lock(&resource_lock);
>> +
>> + /* Get the first resource */
>> + res = parent->child;
>> +
>> + while (res) {
>> + /* Check that the resource is within the node */
>> + if (res->start < start_addr) {
>> + res = res->sibling;
>> + continue;
>> + }
>> + /* Exit if resource is past end of node */
>> + if (res->sibling->end > end_addr)
>> + break;
>> +
>> + ret = merge_resources(res);
>> + if (!ret)
>> + continue;
>> + res = res->sibling;
>> + }
>> + write_unlock(&resource_lock);
>> +}
>> +#endif /* CONFIG_MEMORY_HOTREMOVE */
>> +
>> +/**
>> + * request_resource_and_merge() - request an I/O or memory resource for hot-add
>> + * @parent: parent resource descriptor
>> + * @new: resource descriptor desired by caller
>> + * @nid: node id of the node we want the resource on
>> + *
>> + * Returns NULL for success and conflict resource on error.
>> + * If no conflict resource then attempt to merge resources on the node.
> Please put the return value description at the end and start it with
> 'Return:' rather than 'Returns'
Will do.
>> + *
>> + * This is intended to cleanup the fragmentation of resources that occurs when
>> + * hot-removing memory (see release_mem_region_adjustable). If hot-removing is
>> + * not enabled then there is no point trying to merge resources.
> I think it's worth noting that inability to merge the resources is not an
> error.
Good point.
>> + */
>> +struct resource *request_resource_and_merge(struct resource *parent,
>> + struct resource *new, int nid)
>> +{
>> + struct resource *conflict;
>> +
>> + conflict = request_resource_conflict(parent, new);
>> +
>> + if (conflict)
>> + return conflict;
>> +
>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>> + merge_node_resources(nid, parent);
>> +#endif /* CONFIG_MEMORY_HOTREMOVE */
>> +
>> + return NULL;
>> +}
>> +#endif /* CONFIG_MEMORY_HOTPLUG */
>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>> index 7deb49f69e27..2e342f5ce322 100644
>> --- a/mm/memory_hotplug.c
>> +++ b/mm/memory_hotplug.c
>> @@ -97,7 +97,7 @@ void mem_hotplug_done(void)
>> }
>>
>> /* add this memory to iomem resource */
>> -static struct resource *register_memory_resource(u64 start, u64 size)
>> +static struct resource *register_memory_resource(int nid, u64 start, u64 size)
>> {
>> struct resource *res, *conflict;
>> res = kzalloc(sizeof(struct resource), GFP_KERNEL);
>> @@ -108,7 +108,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
>> res->start = start;
>> res->end = start + size - 1;
>> res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
>> - conflict = request_resource_conflict(&iomem_resource, res);
>> + conflict = request_resource_and_merge(&iomem_resource, res, nid);
>> if (conflict) {
>> if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
>> pr_debug("Device unaddressable memory block "
>> @@ -122,11 +122,15 @@ static struct resource *register_memory_resource(u64 start, u64 size)
>> return res;
>> }
>>
>> -static void release_memory_resource(struct resource *res)
>> +static void release_memory_resource(struct resource *res, u64 start, u64 size)
>> {
>> if (!res)
>> return;
>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>> + release_mem_region_adjustable(&iomem_resource, start, size);
>> +#else
>> release_resource(res);
>> +#endif
>> kfree(res);
>> return;
>> }
>> @@ -1096,17 +1100,13 @@ static int online_memory_block(struct memory_block *mem, void *arg)
>> }
>>
>> /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
>> -int __ref add_memory_resource(int nid, struct resource *res, bool online)
>> +int __ref add_memory_resource(int nid, u64 start, u64 size, bool online)
>> {
>> - u64 start, size;
>> pg_data_t *pgdat = NULL;
>> bool new_pgdat;
>> bool new_node;
>> int ret;
>>
>> - start = res->start;
>> - size = resource_size(res);
>> -
>> ret = check_hotplug_memory_range(start, size);
>> if (ret)
>> return ret;
>> @@ -1195,13 +1195,13 @@ int __ref add_memory(int nid, u64 start, u64 size)
>> struct resource *res;
>> int ret;
>>
>> - res = register_memory_resource(start, size);
>> + res = register_memory_resource(nid, start, size);
>> if (IS_ERR(res))
>> return PTR_ERR(res);
>>
>> - ret = add_memory_resource(nid, res, memhp_auto_online);
>> + ret = add_memory_resource(nid, start, size, memhp_auto_online);
>> if (ret < 0)
>> - release_memory_resource(res);
>> + release_memory_resource(res, start, size);
>> return ret;
>> }
>> EXPORT_SYMBOL_GPL(add_memory);
>> --
>> 2.14.4
>>
On 08/06/2018 08:52 AM, Rashmica Gupta wrote:
> When hot-removing memory release_mem_region_adjustable() splits
> iomem resources if they are not the exact size of the memory being
> hot-deleted. Adding this memory back to the kernel adds a new
> resource.
>
> Eg a node has memory 0x0 - 0xfffffffff. Offlining and hot-removing
> 1GB from 0xf40000000 results in the single resource 0x0-0xfffffffff being
> split into two resources: 0x0-0xf3fffffff and 0xf80000000-0xfffffffff.
>
> When we hot-add the memory back we now have three resources:
> 0x0-0xf3fffffff, 0xf40000000-0xf7fffffff, and 0xf80000000-0xfffffffff.
>
> Now if we try to remove a section of memory that overlaps these resources,
> like 2GB from 0xf40000000, release_mem_region_adjustable() fails as it
> expects the chunk of memory to be within the boundaries of a single
> resource.
Hi,
it's the first time I see the resource code, so I might be easily wrong.
How can it happen that the second remove is section aligned but the
first one not?
> This patch adds a function request_resource_and_merge(). This is called
> instead of request_resource_conflict() when registering a resource in
> add_memory(). It calls request_resource_conflict() and if hot-removing is
> enabled (if it isn't we won't get resource fragmentation) we attempt to
> merge contiguous resources on the node.
>
> Signed-off-by: Rashmica Gupta <[email protected]>
...
> --- a/kernel/resource.c
> +++ b/kernel/resource.c
...
> +/*
> + * Attempt to merge resources on the node
> + */
> +static void merge_node_resources(int nid, struct resource *parent)
> +{
> + struct resource *res;
> + uint64_t start_addr;
> + uint64_t end_addr;
> + int ret;
> +
> + start_addr = node_start_pfn(nid) << PAGE_SHIFT;
> + end_addr = node_end_pfn(nid) << PAGE_SHIFT;
> +
> + write_lock(&resource_lock);
> +
> + /* Get the first resource */
> + res = parent->child;
> +
> + while (res) {
> + /* Check that the resource is within the node */
> + if (res->start < start_addr) {
> + res = res->sibling;
> + continue;
> + }
> + /* Exit if resource is past end of node */
> + if (res->sibling->end > end_addr)
> + break;
IIUC, resource end is closed, so adjacent resources's start is end+1.
But node_end_pfn is open, so the comparison above should use '>='
instead of '>'?
> +
> + ret = merge_resources(res);
> + if (!ret)
> + continue;
> + res = res->sibling;
Should this rather use next_resource() to merge at all levels of the
hierarchy? Although memory seems to be flat under &iomem_resource so it
would be just future-proofing.
Thanks,
Vlastimil
On Tue, Aug 7, 2018 at 9:52 PM, Vlastimil Babka <[email protected]> wrote:
> On 08/06/2018 08:52 AM, Rashmica Gupta wrote:
>> When hot-removing memory release_mem_region_adjustable() splits
>> iomem resources if they are not the exact size of the memory being
>> hot-deleted. Adding this memory back to the kernel adds a new
>> resource.
>>
>> Eg a node has memory 0x0 - 0xfffffffff. Offlining and hot-removing
>> 1GB from 0xf40000000 results in the single resource 0x0-0xfffffffff being
>> split into two resources: 0x0-0xf3fffffff and 0xf80000000-0xfffffffff.
>>
>> When we hot-add the memory back we now have three resources:
>> 0x0-0xf3fffffff, 0xf40000000-0xf7fffffff, and 0xf80000000-0xfffffffff.
>>
>> Now if we try to remove a section of memory that overlaps these resources,
>> like 2GB from 0xf40000000, release_mem_region_adjustable() fails as it
>> expects the chunk of memory to be within the boundaries of a single
>> resource.
>
> Hi,
>
> it's the first time I see the resource code, so I might be easily wrong.
> How can it happen that the second remove is section aligned but the
> first one not?
>
I probably shouldn't have used that word... When I said "a section of memory"
I really meant "a chunk of memory" or "some memory".
>> This patch adds a function request_resource_and_merge(). This is called
>> instead of request_resource_conflict() when registering a resource in
>> add_memory(). It calls request_resource_conflict() and if hot-removing is
>> enabled (if it isn't we won't get resource fragmentation) we attempt to
>> merge contiguous resources on the node.
>>
>> Signed-off-by: Rashmica Gupta <[email protected]>
> ...
>> --- a/kernel/resource.c
>> +++ b/kernel/resource.c
> ...
>> +/*
>> + * Attempt to merge resources on the node
>> + */
>> +static void merge_node_resources(int nid, struct resource *parent)
>> +{
>> + struct resource *res;
>> + uint64_t start_addr;
>> + uint64_t end_addr;
>> + int ret;
>> +
>> + start_addr = node_start_pfn(nid) << PAGE_SHIFT;
>> + end_addr = node_end_pfn(nid) << PAGE_SHIFT;
>> +
>> + write_lock(&resource_lock);
>> +
>> + /* Get the first resource */
>> + res = parent->child;
>> +
>> + while (res) {
>> + /* Check that the resource is within the node */
>> + if (res->start < start_addr) {
>> + res = res->sibling;
>> + continue;
>> + }
>> + /* Exit if resource is past end of node */
>> + if (res->sibling->end > end_addr)
>> + break;
>
> IIUC, resource end is closed, so adjacent resources's start is end+1.
> But node_end_pfn is open, so the comparison above should use '>='
> instead of '>'?
You are right. Thanks for spotting that.
>
>> +
>> + ret = merge_resources(res);
>> + if (!ret)
>> + continue;
>> + res = res->sibling;
>
> Should this rather use next_resource() to merge at all levels of the
> hierarchy? Although memory seems to be flat under &iomem_resource so it
> would be just future-proofing.
I don't know enough about the hierarchy and layout of resources to comment on
this.
>
> Thanks,
> Vlastimil