order >= MAX_ORDER pages are only allocated at boot stage using the
bootmem allocator with the "hugepages=xxx" option. These pages are never
freed after boot by default since it would be a one-way street (>= MAX_ORDER
pages cannot be allocated later), but if the administrator confirms these
gigantic pages will not be used any more, the pinned pages waste memory
since other users can't grab free pages from the gigantic hugetlb pool even
under OOM; it's not flexible. This patchset adds support for shrinking the
hugetlb gigantic page pools. The administrator can enable a knob exported
in sysctl to permit shrinking the gigantic hugetlb pool.
Testcase:
boot: hugepagesz=1G hugepages=10
[root@localhost hugepages]# free -m
total used free shared buffers cached
Mem: 36269 10836 25432 0 11 288
-/+ buffers/cache: 10537 25732
Swap: 35999 0 35999
[root@localhost hugepages]# echo 0 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
-bash: echo: write error: Invalid argument
[root@localhost hugepages]# echo 1 > /proc/sys/vm/hugetlb_shrink_gigantic_pool
[root@localhost hugepages]# echo 0 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
[root@localhost hugepages]# free -m
total used free shared buffers cached
Mem: 36269 597 35672 0 11 288
-/+ buffers/cache: 297 35972
Swap: 35999 0 35999
Wanpeng Li (6):
introduce new sysctl knob which control gigantic page pools shrinking
update_and_free_page gigantic pages awareness
enable gigantic hugetlb page pools shrinking
use already exist huge_page_order() instead of h->order
remove redundant hugetlb_prefault
use already exist interface huge_page_shift
Documentation/sysctl/vm.txt | 13 +++++++
include/linux/hugetlb.h | 5 +--
kernel/sysctl.c | 7 ++++
mm/hugetlb.c | 83 +++++++++++++++++++++++++++++--------------
mm/internal.h | 1 +
mm/page_alloc.c | 2 +-
6 files changed, 82 insertions(+), 29 deletions(-)
--
1.7.10.4
hugetlb_prefault is not used by any callers. This patch removes the
redundant hugetlb_prefault declaration and stub.
Signed-off-by: Wanpeng Li <[email protected]>
---
include/linux/hugetlb.h | 2 --
1 file changed, 2 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index b7e4106..813b265 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -57,7 +57,6 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct page *ref_page);
-int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
void hugetlb_report_meminfo(struct seq_file *);
int hugetlb_report_node_meminfo(int, char *);
void hugetlb_show_meminfo(void);
@@ -113,7 +112,6 @@ static inline unsigned long hugetlb_total_pages(void)
#define follow_hugetlb_page(m,v,p,vs,a,b,i,w) ({ BUG(); 0; })
#define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL)
#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
-#define hugetlb_prefault(mapping, vma) ({ BUG(); 0; })
static inline void hugetlb_report_meminfo(struct seq_file *m)
{
}
--
1.7.10.4
Use the already existing interface huge_page_order() instead of h->order
to get the huge page order.
Signed-off-by: Wanpeng Li <[email protected]>
---
mm/hugetlb.c | 36 +++++++++++++++++++-----------------
1 file changed, 19 insertions(+), 17 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 328f140..0cae950 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -593,7 +593,8 @@ static void update_and_free_page(struct hstate *h, struct page *page)
struct page *p;
int order = huge_page_order(h);
- VM_BUG_ON(!hugetlb_shrink_gigantic_pool && h->order >= MAX_ORDER);
+ VM_BUG_ON(!hugetlb_shrink_gigantic_pool &&
+ huge_page_order(h) >= MAX_ORDER);
h->nr_huge_pages--;
h->nr_huge_pages_node[page_to_nid(page)]--;
@@ -722,7 +723,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
- if (h->order >= MAX_ORDER)
+ if (huge_page_order(h) >= MAX_ORDER)
return NULL;
page = alloc_pages_exact_node(nid,
@@ -876,7 +877,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
struct page *page;
unsigned int r_nid;
- if (h->order >= MAX_ORDER)
+ if (huge_page_order(h) >= MAX_ORDER)
return NULL;
/*
@@ -1071,7 +1072,7 @@ static void return_unused_surplus_pages(struct hstate *h,
h->resv_huge_pages -= unused_resv_pages;
/* Cannot return gigantic pages currently */
- if (h->order >= MAX_ORDER)
+ if (huge_page_order(h) >= MAX_ORDER)
return;
nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
@@ -1265,7 +1266,7 @@ static void __init gather_bootmem_prealloc(void)
#endif
__ClearPageReserved(page);
WARN_ON(page_count(page) != 1);
- prep_compound_huge_page(page, h->order);
+ prep_compound_huge_page(page, huge_page_order(h));
prep_new_huge_page(h, page, page_to_nid(page));
/*
* If we had gigantic hugepages allocated at boot time, we need
@@ -1273,8 +1274,8 @@ static void __init gather_bootmem_prealloc(void)
* fix confusing memory reports from free(1) and another
* side-effects, like CommitLimit going negative.
*/
- if (h->order > (MAX_ORDER - 1))
- totalram_pages += 1 << h->order;
+ if (huge_page_order(h) > (MAX_ORDER - 1))
+ totalram_pages += 1 << huge_page_order(h);
}
}
@@ -1283,7 +1284,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
unsigned long i;
for (i = 0; i < h->max_huge_pages; ++i) {
- if (h->order >= MAX_ORDER) {
+ if (huge_page_order(h) >= MAX_ORDER) {
if (!alloc_bootmem_huge_page(h))
break;
} else if (!alloc_fresh_huge_page(h,
@@ -1299,7 +1300,7 @@ static void __init hugetlb_init_hstates(void)
for_each_hstate(h) {
/* oversize hugepages were init'ed in early boot */
- if (h->order < MAX_ORDER)
+ if (huge_page_order(h) < MAX_ORDER)
hugetlb_hstate_alloc_pages(h);
}
}
@@ -1333,7 +1334,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
{
int i;
- if (h->order >= MAX_ORDER)
+ if (huge_page_order(h) >= MAX_ORDER)
return;
for_each_node_mask(i, *nodes_allowed) {
@@ -1416,8 +1417,8 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
{
unsigned long min_count, ret;
- if (h->order >= MAX_ORDER && (!hugetlb_shrink_gigantic_pool ||
- count > persistent_huge_pages(h)))
+ if (huge_page_order(h) >= MAX_ORDER && (!hugetlb_shrink_gigantic_pool
+ || count > persistent_huge_pages(h)))
return h->max_huge_pages;
/*
@@ -1543,7 +1544,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
goto out;
h = kobj_to_hstate(kobj, &nid);
- if (h->order >= MAX_ORDER && !hugetlb_shrink_gigantic_pool) {
+ if (huge_page_order(h) >= MAX_ORDER && !hugetlb_shrink_gigantic_pool) {
err = -EINVAL;
goto out;
}
@@ -1626,7 +1627,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
unsigned long input;
struct hstate *h = kobj_to_hstate(kobj, NULL);
- if (h->order >= MAX_ORDER)
+ if (huge_page_order(h) >= MAX_ORDER)
return -EINVAL;
err = strict_strtoul(buf, 10, &input);
@@ -2037,7 +2038,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
tmp = h->max_huge_pages;
- if (write && h->order >= MAX_ORDER && !hugetlb_shrink_gigantic_pool)
+ if (write && huge_page_order(h) >= MAX_ORDER &&
+ !hugetlb_shrink_gigantic_pool)
return -EINVAL;
table->data = &tmp;
@@ -2102,7 +2104,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
tmp = h->nr_overcommit_huge_pages;
- if (write && h->order >= MAX_ORDER)
+ if (write && huge_page_order(h) >= MAX_ORDER)
return -EINVAL;
table->data = &tmp;
@@ -3093,7 +3095,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
flush_tlb_range(vma, start, end);
mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
- return pages << h->order;
+ return pages << huge_page_order(h);
}
int hugetlb_reserve_pages(struct inode *inode,
--
1.7.10.4
order >= MAX_ORDER pages can't be freed to the buddy system directly; this
patch destroys the gigantic hugetlb page into normal order-0 pages and
frees them one by one.
Signed-off-by: Wanpeng Li <[email protected]>
---
mm/hugetlb.c | 39 +++++++++++++++++++++++++++++----------
mm/internal.h | 1 +
mm/page_alloc.c | 2 +-
3 files changed, 31 insertions(+), 11 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4a0c270..eeaf6f2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -579,25 +579,44 @@ err:
return NULL;
}
+static inline void clear_page_flag(struct page *page)
+{
+ page->flags &= ~(1 << PG_locked | 1 << PG_error |
+ 1 << PG_referenced | 1 << PG_dirty |
+ 1 << PG_active | 1 << PG_reserved |
+ 1 << PG_private | 1 << PG_writeback);
+}
+
static void update_and_free_page(struct hstate *h, struct page *page)
{
int i;
+ struct page *p;
+ int order = huge_page_order(h);
- VM_BUG_ON(h->order >= MAX_ORDER);
+ VM_BUG_ON(!hugetlb_shrink_gigantic_pool && h->order >= MAX_ORDER);
h->nr_huge_pages--;
h->nr_huge_pages_node[page_to_nid(page)]--;
- for (i = 0; i < pages_per_huge_page(h); i++) {
- page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
- 1 << PG_referenced | 1 << PG_dirty |
- 1 << PG_active | 1 << PG_reserved |
- 1 << PG_private | 1 << PG_writeback);
- }
- VM_BUG_ON(hugetlb_cgroup_from_page(page));
set_compound_page_dtor(page, NULL);
- set_page_refcounted(page);
arch_release_hugepage(page);
- __free_pages(page, huge_page_order(h));
+ VM_BUG_ON(hugetlb_cgroup_from_page(page));
+
+ if (order < MAX_ORDER) {
+ for (i = 0; i < pages_per_huge_page(h); i++)
+ clear_page_flag(page+i);
+ set_page_refcounted(page);
+ __free_pages(page, huge_page_order(h));
+ } else {
+ int nr_pages = 1 << order;
+ destroy_compound_page(page, order);
+ set_compound_order(page, 0);
+ for (i = 0, p = page; i < nr_pages; i++,
+ p = mem_map_next(p, page, i)) {
+ clear_page_flag(p);
+ set_page_refcounted(p);
+ __free_pages(p, 0);
+ }
+ }
}
struct hstate *size_to_hstate(unsigned long size)
diff --git a/mm/internal.h b/mm/internal.h
index 8562de0..a63a35f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -101,6 +101,7 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
*/
extern void __free_pages_bootmem(struct page *page, unsigned int order);
extern void prep_compound_page(struct page *page, unsigned long order);
+extern int destroy_compound_page(struct page *page, unsigned long order);
#ifdef CONFIG_MEMORY_FAILURE
extern bool is_free_buddy_page(struct page *page);
#endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1394c5a..0ea14ba 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -367,7 +367,7 @@ void prep_compound_page(struct page *page, unsigned long order)
}
/* update __split_huge_page_refcount if you change this function */
-static int destroy_compound_page(struct page *page, unsigned long order)
+int destroy_compound_page(struct page *page, unsigned long order)
{
int i;
int nr_pages = 1 << order;
--
1.7.10.4
Use the already existing interface huge_page_shift() instead of h->order + PAGE_SHIFT.
Signed-off-by: Wanpeng Li <[email protected]>
---
mm/hugetlb.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0cae950..750ed8a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -320,7 +320,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
hstate = hstate_vma(vma);
- return 1UL << (hstate->order + PAGE_SHIFT);
+ return 1UL << huge_page_shift(hstate);
}
EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
--
1.7.10.4
Enable gigantic hugetlb page pools shrinking.
Signed-off-by: Wanpeng Li <[email protected]>
---
mm/hugetlb.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eeaf6f2..328f140 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1416,7 +1416,8 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
{
unsigned long min_count, ret;
- if (h->order >= MAX_ORDER)
+ if (h->order >= MAX_ORDER && (!hugetlb_shrink_gigantic_pool ||
+ count > persistent_huge_pages(h)))
return h->max_huge_pages;
/*
@@ -1542,7 +1543,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
goto out;
h = kobj_to_hstate(kobj, &nid);
- if (h->order >= MAX_ORDER) {
+ if (h->order >= MAX_ORDER && !hugetlb_shrink_gigantic_pool) {
err = -EINVAL;
goto out;
}
@@ -2036,7 +2037,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
tmp = h->max_huge_pages;
- if (write && h->order >= MAX_ORDER)
+ if (write && h->order >= MAX_ORDER && !hugetlb_shrink_gigantic_pool)
return -EINVAL;
table->data = &tmp;
--
1.7.10.4
This patch introduces a new sysctl knob to support gigantic hugetlb page
pool shrinking. The default value is 0 since gigantic page pools are not
permitted to be shrunk by default; the administrator can echo 1 to the
knob to enable gigantic page pool shrinking after confirming the pages
won't be used any more.
Signed-off-by: Wanpeng Li <[email protected]>
---
Documentation/sysctl/vm.txt | 13 +++++++++++++
include/linux/hugetlb.h | 3 +++
kernel/sysctl.c | 7 +++++++
mm/hugetlb.c | 9 +++++++++
4 files changed, 32 insertions(+)
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 21ad181..3baf332 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -30,6 +30,7 @@ Currently, these files are in /proc/sys/vm:
- extfrag_threshold
- hugepages_treat_as_movable
- hugetlb_shm_group
+- hugetlb_shrink_gigantic_pool
- laptop_mode
- legacy_va_layout
- lowmem_reserve_ratio
@@ -211,6 +212,18 @@ shared memory segment using hugetlb page.
==============================================================
+hugetlb_shrink_gigantic_pool
+
+order >= MAX_ORDER pages are only allocated at boot stage using the bootmem
+allocator with the "hugepages=xxx" option. These pages are never freed
+by default since it would be a one-way street (>= MAX_ORDER pages cannot
+be allocated later), but if the administrator confirms these gigantic
+pages will not be used any more, the pinned pages waste memory since other
+users can't grab free pages from the gigantic hugetlb pool even under OOM.
+The administrator can enable this parameter to permit shrinking the
+gigantic hugetlb pool.
+==============================================================
+
laptop_mode
laptop_mode is a knob that controls "laptop mode". All the things that are
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3a62df3..b7e4106 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -36,6 +36,8 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
+int hugetlb_shrink_gigantic_pool_handler(struct ctl_table *,
+ int, void __user *, size_t *, loff_t *);
#ifdef CONFIG_NUMA
int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int,
@@ -73,6 +75,7 @@ extern unsigned long hugepages_treat_as_movable;
extern const unsigned long hugetlb_zero, hugetlb_infinity;
extern int sysctl_hugetlb_shm_group;
extern struct list_head huge_boot_pages;
+extern int hugetlb_shrink_gigantic_pool;
/* arch callbacks */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3dadde5..25eb85f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1187,6 +1187,13 @@ static struct ctl_table vm_table[] = {
.extra1 = (void *)&hugetlb_zero,
.extra2 = (void *)&hugetlb_infinity,
},
+ {
+ .procname = "hugetlb_shrink_gigantic_pool",
+ .data = &hugetlb_shrink_gigantic_pool,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = hugetlb_shrink_gigantic_pool_handler,
+ },
#ifdef CONFIG_NUMA
{
.procname = "nr_hugepages_mempolicy",
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bacdf38..4a0c270 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -35,6 +35,7 @@
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
+int hugetlb_shrink_gigantic_pool;
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
@@ -671,6 +672,14 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
}
}
+int hugetlb_shrink_gigantic_pool_handler(struct ctl_table *table, int write,
+			void __user *buffer,
+			size_t *length, loff_t *ppos)
+{
+	/* Propagate errors (e.g. -EFAULT/-EINVAL) instead of discarding them */
+	return proc_dointvec(table, write, buffer, length, ppos);
+}
+
/*
* PageHuge() only returns true for hugetlbfs pages, but not for normal or
* transparent huge pages. See the PageTransHuge() documentation for more
--
1.7.10.4
On Thu 04-04-13 17:09:08, Wanpeng Li wrote:
> order >= MAX_ORDER pages are only allocated at boot stage using the
> bootmem allocator with the "hugepages=xxx" option. These pages are never
> free after boot by default since it would be a one-way street(>= MAX_ORDER
> pages cannot be allocated later), but if administrator confirm not to
> use these gigantic pages any more, these pinned pages will waste memory
> since other users can't grab free pages from gigantic hugetlb pool even
> if OOM, it's not flexible. The patchset add hugetlb gigantic page pools
> shrink supporting. Administrator can enable knob exported in sysctl to
> permit to shrink gigantic hugetlb pool.
I am not sure I see why the new knob is needed.
/sys/kernel/mm/hugepages/hugepages-*/nr_hugepages is root interface so
an additional step to allow writing to the file doesn't make much sense
to me to be honest.
Support for shrinking gigantic huge pages makes some sense to me but I
would be interested in the real world example. GB pages are usually used
in very specific environments where the amount is usually well known.
I could imagine nr_hugepages_mempolicy would make more sense to free
pages from particular nodes so they could be offlined for example.
Does the patchset handles this as well?
> Testcase:
> boot: hugepagesz=1G hugepages=10
>
> [root@localhost hugepages]# free -m
> total used free shared buffers cached
> Mem: 36269 10836 25432 0 11 288
> -/+ buffers/cache: 10537 25732
> Swap: 35999 0 35999
> [root@localhost hugepages]# echo 0 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
> -bash: echo: write error: Invalid argument
> [root@localhost hugepages]# echo 1 > /proc/sys/vm/hugetlb_shrink_gigantic_pool
> [root@localhost hugepages]# echo 0 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
> [root@localhost hugepages]# free -m
> total used free shared buffers cached
> Mem: 36269 597 35672 0 11 288
> -/+ buffers/cache: 297 35972
> Swap: 35999 0 35999
>
> Wanpeng Li (6):
> introduce new sysctl knob which control gigantic page pools shrinking
> update_and_free_page gigantic pages awareness
> enable gigantic hugetlb page pools shrinking
> use already exist huge_page_order() instead of h->order
> remove redundant hugetlb_prefault
> use already exist interface huge_page_shift
>
> Documentation/sysctl/vm.txt | 13 +++++++
> include/linux/hugetlb.h | 5 +--
> kernel/sysctl.c | 7 ++++
> mm/hugetlb.c | 83 +++++++++++++++++++++++++++++--------------
> mm/internal.h | 1 +
> mm/page_alloc.c | 2 +-
> 6 files changed, 82 insertions(+), 29 deletions(-)
>
> --
> 1.7.10.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
--
Michal Hocko
SUSE Labs
On Thu 04-04-13 18:17:46, Michal Hocko wrote:
> On Thu 04-04-13 17:09:08, Wanpeng Li wrote:
> > order >= MAX_ORDER pages are only allocated at boot stage using the
> > bootmem allocator with the "hugepages=xxx" option. These pages are never
> > free after boot by default since it would be a one-way street(>= MAX_ORDER
> > pages cannot be allocated later), but if administrator confirm not to
> > use these gigantic pages any more, these pinned pages will waste memory
> > since other users can't grab free pages from gigantic hugetlb pool even
> > if OOM, it's not flexible. The patchset add hugetlb gigantic page pools
> > shrink supporting. Administrator can enable knob exported in sysctl to
> > permit to shrink gigantic hugetlb pool.
>
> I am not sure I see why the new knob is needed.
> /sys/kernel/mm/hugepages/hugepages-*/nr_hugepages is root interface so
> an additional step to allow writing to the file doesn't make much sense
> to me to be honest.
>
> Support for shrinking gigantic huge pages makes some sense to me but I
> would be interested in the real world example. GB pages are usually used
> in very specific environments where the amount is usually well known.
>
> I could imagine nr_hugepages_mempolicy would make more sense to free
> pages from particular nodes so they could be offlined for example.
> Does the patchset handles this as well?
Ohh, I should have checked before asking. Both knobs use the same
hugetlb_sysctl_handler_common and unless there is something hardcoded in
the patches then it should be supproted.
--
Michal Hocko
SUSE Labs
On Fri 05-04-13 07:41:23, Wanpeng Li wrote:
> On Thu, Apr 04, 2013 at 06:17:46PM +0200, Michal Hocko wrote:
> >On Thu 04-04-13 17:09:08, Wanpeng Li wrote:
> >> order >= MAX_ORDER pages are only allocated at boot stage using the
> >> bootmem allocator with the "hugepages=xxx" option. These pages are never
> >> free after boot by default since it would be a one-way street(>= MAX_ORDER
> >> pages cannot be allocated later), but if administrator confirm not to
> >> use these gigantic pages any more, these pinned pages will waste memory
> >> since other users can't grab free pages from gigantic hugetlb pool even
> >> if OOM, it's not flexible. The patchset add hugetlb gigantic page pools
> >> shrink supporting. Administrator can enable knob exported in sysctl to
> >> permit to shrink gigantic hugetlb pool.
> >
> >I am not sure I see why the new knob is needed.
> >/sys/kernel/mm/hugepages/hugepages-*/nr_hugepages is root interface so
> >an additional step to allow writing to the file doesn't make much sense
> >to me to be honest.
> >
> >Support for shrinking gigantic huge pages makes some sense to me but I
> >would be interested in the real world example. GB pages are usually used
> >in very specific environments where the amount is usually well known.
>
> Gigantic huge pages in hugetlb means h->order >= MAX_ORDER instead of GB
> pages. ;-)
Yes, I am aware of that but the question remains the same (and
unanswered). What is the use case?
--
Michal Hocko
SUSE Labs
Hi Michal,
On 04/05/2013 04:12 PM, Michal Hocko wrote:
> On Fri 05-04-13 07:41:23, Wanpeng Li wrote:
>> On Thu, Apr 04, 2013 at 06:17:46PM +0200, Michal Hocko wrote:
>>> On Thu 04-04-13 17:09:08, Wanpeng Li wrote:
>>>> order >= MAX_ORDER pages are only allocated at boot stage using the
>>>> bootmem allocator with the "hugepages=xxx" option. These pages are never
>>>> free after boot by default since it would be a one-way street(>= MAX_ORDER
>>>> pages cannot be allocated later), but if administrator confirm not to
>>>> use these gigantic pages any more, these pinned pages will waste memory
>>>> since other users can't grab free pages from gigantic hugetlb pool even
>>>> if OOM, it's not flexible. The patchset add hugetlb gigantic page pools
>>>> shrink supporting. Administrator can enable knob exported in sysctl to
>>>> permit to shrink gigantic hugetlb pool.
>>> I am not sure I see why the new knob is needed.
>>> /sys/kernel/mm/hugepages/hugepages-*/nr_hugepages is root interface so
>>> an additional step to allow writing to the file doesn't make much sense
>>> to me to be honest.
>>>
>>> Support for shrinking gigantic huge pages makes some sense to me but I
>>> would be interested in the real world example. GB pages are usually used
>>> in very specific environments where the amount is usually well known.
>> Gigantic huge pages in hugetlb means h->order >= MAX_ORDER instead of GB
>> pages. ;-)
> Yes, I am aware of that but the question remains the same (and
> unanswered). What is the use case?
As patch description, "if administrator confirm not to use these
gigantic pages any more, these pinned pages will waste memory since
other users can't grab free pages from gigantic hugetlb pool even if OOM".
>
On Fri 05-04-13 16:27:59, Wanpeng Li wrote:
> On Fri, Apr 05, 2013 at 10:12:39AM +0200, Michal Hocko wrote:
> >On Fri 05-04-13 07:41:23, Wanpeng Li wrote:
> >> On Thu, Apr 04, 2013 at 06:17:46PM +0200, Michal Hocko wrote:
> >> >On Thu 04-04-13 17:09:08, Wanpeng Li wrote:
> >> >> order >= MAX_ORDER pages are only allocated at boot stage using the
> >> >> bootmem allocator with the "hugepages=xxx" option. These pages are never
> >> >> free after boot by default since it would be a one-way street(>= MAX_ORDER
> >> >> pages cannot be allocated later), but if administrator confirm not to
> >> >> use these gigantic pages any more, these pinned pages will waste memory
> >> >> since other users can't grab free pages from gigantic hugetlb pool even
> >> >> if OOM, it's not flexible. The patchset add hugetlb gigantic page pools
> >> >> shrink supporting. Administrator can enable knob exported in sysctl to
> >> >> permit to shrink gigantic hugetlb pool.
> >> >
> >> >I am not sure I see why the new knob is needed.
> >> >/sys/kernel/mm/hugepages/hugepages-*/nr_hugepages is root interface so
> >> >an additional step to allow writing to the file doesn't make much sense
> >> >to me to be honest.
> >> >
> >> >Support for shrinking gigantic huge pages makes some sense to me but I
> >> >would be interested in the real world example. GB pages are usually used
> >> >in very specific environments where the amount is usually well known.
> >>
> >> Gigantic huge pages in hugetlb means h->order >= MAX_ORDER instead of GB
> >> pages. ;-)
> >
> >Yes, I am aware of that but the question remains the same (and
> >unanswered). What is the use case?
>
> The use case I can figure out is when memory pressure is serious and gigantic
> huge pages pools still pin large number of free pages.
Then this is a configuration issue. I understand that reboot is lame way
to fix it but the gigantic pages usage is so specific that I would be
really surprise if this kind of problem would pop out. I would also find
surprising if those pages were unused.
So the only use case I can figure out ATM is a hotplug scenario (after
hugetlb migration patchset is ready) but even then I would find it more
useful for in kernel usage (read hotplug).
--
Michal Hocko
SUSE Labs
On Fri 05-04-13 16:54:44, Simon Jeons wrote:
> Hi Michal,
> On 04/05/2013 04:12 PM, Michal Hocko wrote:
> >On Fri 05-04-13 07:41:23, Wanpeng Li wrote:
> >>On Thu, Apr 04, 2013 at 06:17:46PM +0200, Michal Hocko wrote:
> >>>On Thu 04-04-13 17:09:08, Wanpeng Li wrote:
> >>>>order >= MAX_ORDER pages are only allocated at boot stage using the
> >>>>bootmem allocator with the "hugepages=xxx" option. These pages are never
> >>>>free after boot by default since it would be a one-way street(>= MAX_ORDER
> >>>>pages cannot be allocated later), but if administrator confirm not to
> >>>>use these gigantic pages any more, these pinned pages will waste memory
> >>>>since other users can't grab free pages from gigantic hugetlb pool even
> >>>>if OOM, it's not flexible. The patchset add hugetlb gigantic page pools
> >>>>shrink supporting. Administrator can enable knob exported in sysctl to
> >>>>permit to shrink gigantic hugetlb pool.
> >>>I am not sure I see why the new knob is needed.
> >>>/sys/kernel/mm/hugepages/hugepages-*/nr_hugepages is root interface so
> >>>an additional step to allow writing to the file doesn't make much sense
> >>>to me to be honest.
> >>>
> >>>Support for shrinking gigantic huge pages makes some sense to me but I
> >>>would be interested in the real world example. GB pages are usually used
> >>>in very specific environments where the amount is usually well known.
> >>Gigantic huge pages in hugetlb means h->order >= MAX_ORDER instead of GB
> >>pages. ;-)
> >Yes, I am aware of that but the question remains the same (and
> >unanswered). What is the use case?
>
> As patch description, "if administrator confirm not to use these
> gigantic pages any more, these pinned pages will waste memory since
> other users can't grab free pages from gigantic hugetlb pool even if
> OOM".
Is this a use case that we care about? How often something like that
happens? I understand this is "nice to have" but I am interested whether
somebody actually _needs_ this.
--
Michal Hocko
SUSE Labs
On Fri, Apr 12, 2013 at 07:29:07AM +0800, Wanpeng Li wrote:
> Ping Andi,
> On Thu, Apr 04, 2013 at 05:09:08PM +0800, Wanpeng Li wrote:
> >order >= MAX_ORDER pages are only allocated at boot stage using the
> >bootmem allocator with the "hugepages=xxx" option. These pages are never
> >free after boot by default since it would be a one-way street(>= MAX_ORDER
> >pages cannot be allocated later), but if administrator confirm not to
> >use these gigantic pages any more, these pinned pages will waste memory
> >since other users can't grab free pages from gigantic hugetlb pool even
> >if OOM, it's not flexible. The patchset add hugetlb gigantic page pools
> >shrink supporting. Administrator can enable knob exported in sysctl to
> >permit to shrink gigantic hugetlb pool.
I originally didn't allow this because it's only one way and it seemed
dubious. I've been recently working on a new patchkit to allocate
GB pages from CMA. With that freeing actually makes sense, as
the pages can be reallocated.
-Andi