Augments hugetlb_cgroup_charge_cgroup to be able to charge hugetlb
usage or hugetlb reservation counter.
Adds a new interface to uncharge a hugetlb_cgroup counter via
hugetlb_cgroup_uncharge_counter.
Integrates the counter with hugetlb_cgroup, via hugetlb_cgroup_init,
hugetlb_cgroup_have_usage, and hugetlb_cgroup_css_offline.
Signed-off-by: Mina Almasry <[email protected]>
---
Changes in v10:
- Added missing VM_BUG_ON
Changes in V9:
- Fixed HUGETLB_CGROUP_MIN_ORDER.
- Minor variable name update.
- Moved some init/cleanup code from later patches in the series to this patch.
- Updated reparenting of reservation accounting.
---
include/linux/hugetlb_cgroup.h | 68 ++++++++++++++---------
mm/hugetlb.c | 19 ++++---
mm/hugetlb_cgroup.c | 99 +++++++++++++++++++++++++---------
3 files changed, 128 insertions(+), 58 deletions(-)
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index 063962f6dfc6a..eab8a70d5bcb5 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -20,29 +20,37 @@
struct hugetlb_cgroup;
/*
* Minimum page order trackable by hugetlb cgroup.
- * At least 3 pages are necessary for all the tracking information.
+ * At least 4 pages are necessary for all the tracking information.
*/
#define HUGETLB_CGROUP_MIN_ORDER 2
#ifdef CONFIG_CGROUP_HUGETLB
-static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page)
+static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page,
+ bool reserved)
{
VM_BUG_ON_PAGE(!PageHuge(page), page);
if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
return NULL;
- return (struct hugetlb_cgroup *)page[2].private;
+ if (reserved)
+ return (struct hugetlb_cgroup *)page[3].private;
+ else
+ return (struct hugetlb_cgroup *)page[2].private;
}
-static inline
-int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg)
+static inline int set_hugetlb_cgroup(struct page *page,
+ struct hugetlb_cgroup *h_cg,
+ bool reservation)
{
VM_BUG_ON_PAGE(!PageHuge(page), page);
if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
return -1;
- page[2].private = (unsigned long)h_cg;
+ if (reservation)
+ page[3].private = (unsigned long)h_cg;
+ else
+ page[2].private = (unsigned long)h_cg;
return 0;
}
@@ -52,26 +60,34 @@ static inline bool hugetlb_cgroup_disabled(void)
}
extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
- struct hugetlb_cgroup **ptr);
+ struct hugetlb_cgroup **ptr,
+ bool reserved);
extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg,
- struct page *page);
+ struct page *page, bool reserved);
extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
- struct page *page);
+ struct page *page, bool reserved);
+
extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
- struct hugetlb_cgroup *h_cg);
+ struct hugetlb_cgroup *h_cg,
+ bool reserved);
+extern void hugetlb_cgroup_uncharge_counter(struct page_counter *p,
+ unsigned long nr_pages,
+ struct cgroup_subsys_state *css);
+
extern void hugetlb_cgroup_file_init(void) __init;
extern void hugetlb_cgroup_migrate(struct page *oldhpage,
struct page *newhpage);
#else
-static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page)
+static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page,
+ bool reserved)
{
return NULL;
}
-static inline
-int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg)
+static inline int set_hugetlb_cgroup(struct page *page,
+ struct hugetlb_cgroup *h_cg, bool reserved)
{
return 0;
}
@@ -81,28 +97,30 @@ static inline bool hugetlb_cgroup_disabled(void)
return true;
}
-static inline int
-hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
- struct hugetlb_cgroup **ptr)
+static inline int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup **ptr,
+ bool reserved)
{
return 0;
}
-static inline void
-hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
- struct hugetlb_cgroup *h_cg,
- struct page *page)
+static inline void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup *h_cg,
+ struct page *page,
+ bool reserved)
{
}
-static inline void
-hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page)
+static inline void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
+ struct page *page,
+ bool reserved)
{
}
-static inline void
-hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
- struct hugetlb_cgroup *h_cg)
+static inline void hugetlb_cgroup_uncharge_cgroup(int idx,
+ unsigned long nr_pages,
+ struct hugetlb_cgroup *h_cg,
+ bool reserved)
{
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dd8737a94bec4..62a4cf3db4090 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1068,7 +1068,8 @@ static void update_and_free_page(struct hstate *h, struct page *page)
1 << PG_active | 1 << PG_private |
1 << PG_writeback);
}
- VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
+ VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page, false), page);
+ VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page, true), page);
set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
set_page_refcounted(page);
if (hstate_is_gigantic(h)) {
@@ -1178,8 +1179,8 @@ static void __free_huge_page(struct page *page)
spin_lock(&hugetlb_lock);
clear_page_huge_active(page);
- hugetlb_cgroup_uncharge_page(hstate_index(h),
- pages_per_huge_page(h), page);
+ hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h),
+ page, false);
if (restore_reserve)
h->resv_huge_pages++;
@@ -1253,7 +1254,8 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
spin_lock(&hugetlb_lock);
- set_hugetlb_cgroup(page, NULL);
+ set_hugetlb_cgroup(page, NULL, false);
+ set_hugetlb_cgroup(page, NULL, true);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
spin_unlock(&hugetlb_lock);
@@ -2039,7 +2041,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
gbl_chg = 1;
}
- ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
+ ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg,
+ false);
if (ret)
goto out_subpool_put;
@@ -2063,7 +2066,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
list_move(&page->lru, &h->hugepage_activelist);
/* Fall through */
}
- hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
+ hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page,
+ false);
spin_unlock(&hugetlb_lock);
set_page_private(page, (unsigned long)spool);
@@ -2087,7 +2091,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
return page;
out_uncharge_cgroup:
- hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
+ hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg,
+ false);
out_subpool_put:
if (map_chg || avoid_reserve)
hugepage_subpool_put_pages(spool, 1);
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 209f9b9604d34..c434f69f38354 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -96,8 +96,12 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
int idx;
for (idx = 0; idx < hugetlb_max_hstate; idx++) {
- if (page_counter_read(&h_cg->hugepage[idx]))
+ if (page_counter_read(
+ hugetlb_cgroup_get_counter(h_cg, idx, true)) ||
+ page_counter_read(
+ hugetlb_cgroup_get_counter(h_cg, idx, false))) {
return true;
+ }
}
return false;
}
@@ -108,18 +112,33 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
int idx;
for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
- struct page_counter *counter = &h_cgroup->hugepage[idx];
- struct page_counter *parent = NULL;
+ struct page_counter *fault_parent = NULL;
+ struct page_counter *reserved_parent = NULL;
unsigned long limit;
int ret;
- if (parent_h_cgroup)
- parent = &parent_h_cgroup->hugepage[idx];
- page_counter_init(counter, parent);
+ if (parent_h_cgroup) {
+ fault_parent = hugetlb_cgroup_get_counter(
+ parent_h_cgroup, idx, false);
+ reserved_parent = hugetlb_cgroup_get_counter(
+ parent_h_cgroup, idx, true);
+ }
+ page_counter_init(hugetlb_cgroup_get_counter(h_cgroup, idx,
+ false),
+ fault_parent);
+ page_counter_init(hugetlb_cgroup_get_counter(h_cgroup, idx,
+ true),
+ reserved_parent);
limit = round_down(PAGE_COUNTER_MAX,
1 << huge_page_order(&hstates[idx]));
- ret = page_counter_set_max(counter, limit);
+
+ ret = page_counter_set_max(
+ hugetlb_cgroup_get_counter(h_cgroup, idx, false),
+ limit);
+ VM_BUG_ON(ret);
+ ret = page_counter_set_max(
+ hugetlb_cgroup_get_counter(h_cgroup, idx, true), limit);
VM_BUG_ON(ret);
}
}
@@ -149,7 +168,6 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
kfree(h_cgroup);
}
-
/*
* Should be called with hugetlb_lock held.
* Since we are holding hugetlb_lock, pages cannot get moved from
@@ -165,7 +183,7 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
struct hugetlb_cgroup *page_hcg;
struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
- page_hcg = hugetlb_cgroup_from_page(page);
+ page_hcg = hugetlb_cgroup_from_page(page, false);
/*
* We can have pages in active list without any cgroup
* ie, hugepage with less than 3 pages. We can safely
@@ -184,7 +202,7 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
/* Take the pages off the local counter */
page_counter_cancel(counter, nr_pages);
- set_hugetlb_cgroup(page, parent);
+ set_hugetlb_cgroup(page, parent, false);
out:
return;
}
@@ -227,7 +245,7 @@ static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx,
}
int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
- struct hugetlb_cgroup **ptr)
+ struct hugetlb_cgroup **ptr, bool reserved)
{
int ret = 0;
struct page_counter *counter;
@@ -250,13 +268,20 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
}
rcu_read_unlock();
- if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages,
- &counter)) {
+ if (!page_counter_try_charge(hugetlb_cgroup_get_counter(h_cg, idx,
+ reserved),
+ nr_pages, &counter)) {
ret = -ENOMEM;
hugetlb_event(hugetlb_cgroup_from_counter(counter, idx), idx,
HUGETLB_MAX);
+ css_put(&h_cg->css);
+ goto done;
}
- css_put(&h_cg->css);
+ /* Reservations take a reference to the css because they do not get
+ * reparented.
+ */
+ if (!reserved)
+ css_put(&h_cg->css);
done:
*ptr = h_cg;
return ret;
@@ -265,12 +290,12 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
/* Should be called with hugetlb_lock held */
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
struct hugetlb_cgroup *h_cg,
- struct page *page)
+ struct page *page, bool reserved)
{
if (hugetlb_cgroup_disabled() || !h_cg)
return;
- set_hugetlb_cgroup(page, h_cg);
+ set_hugetlb_cgroup(page, h_cg, reserved);
return;
}
@@ -278,23 +303,29 @@ void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
* Should be called with hugetlb_lock held
*/
void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
- struct page *page)
+ struct page *page, bool reserved)
{
struct hugetlb_cgroup *h_cg;
if (hugetlb_cgroup_disabled())
return;
lockdep_assert_held(&hugetlb_lock);
- h_cg = hugetlb_cgroup_from_page(page);
+ h_cg = hugetlb_cgroup_from_page(page, reserved);
if (unlikely(!h_cg))
return;
- set_hugetlb_cgroup(page, NULL);
- page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
+ set_hugetlb_cgroup(page, NULL, reserved);
+
+ page_counter_uncharge(hugetlb_cgroup_get_counter(h_cg, idx, reserved),
+ nr_pages);
+
+ if (reserved)
+ css_put(&h_cg->css);
+
return;
}
void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
- struct hugetlb_cgroup *h_cg)
+ struct hugetlb_cgroup *h_cg, bool reserved)
{
if (hugetlb_cgroup_disabled() || !h_cg)
return;
@@ -302,8 +333,22 @@ void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
return;
- page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
- return;
+ page_counter_uncharge(hugetlb_cgroup_get_counter(h_cg, idx, reserved),
+ nr_pages);
+
+ if (reserved)
+ css_put(&h_cg->css);
+}
+
+void hugetlb_cgroup_uncharge_counter(struct page_counter *p,
+ unsigned long nr_pages,
+ struct cgroup_subsys_state *css)
+{
+ if (hugetlb_cgroup_disabled() || !p || !css)
+ return;
+
+ page_counter_uncharge(p, nr_pages);
+ css_put(css);
}
enum {
@@ -675,6 +720,7 @@ void __init hugetlb_cgroup_file_init(void)
void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
{
struct hugetlb_cgroup *h_cg;
+ struct hugetlb_cgroup *h_cg_reservation;
struct hstate *h = page_hstate(oldhpage);
if (hugetlb_cgroup_disabled())
@@ -682,11 +728,12 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
spin_lock(&hugetlb_lock);
- h_cg = hugetlb_cgroup_from_page(oldhpage);
- set_hugetlb_cgroup(oldhpage, NULL);
+ h_cg = hugetlb_cgroup_from_page(oldhpage, false);
+ h_cg_reservation = hugetlb_cgroup_from_page(oldhpage, true);
+ set_hugetlb_cgroup(oldhpage, NULL, false);
/* move the h_cg details to new cgroup */
- set_hugetlb_cgroup(newhpage, h_cg);
+ set_hugetlb_cgroup(newhpage, h_cg_reservation, true);
list_move(&newhpage->lru, &h->hugepage_activelist);
spin_unlock(&hugetlb_lock);
return;
--
2.25.0.rc1.283.g88dfdc4193-goog
On 1/14/20 5:26 PM, Mina Almasry wrote:
> Augments hugetlb_cgroup_charge_cgroup to be able to charge hugetlb
> usage or hugetlb reservation counter.
>
> Adds a new interface to uncharge a hugetlb_cgroup counter via
> hugetlb_cgroup_uncharge_counter.
>
> Integrates the counter with hugetlb_cgroup, via hugetlb_cgroup_init,
> hugetlb_cgroup_have_usage, and hugetlb_cgroup_css_offline.
>
> Signed-off-by: Mina Almasry <[email protected]>
>
> ---
>
> Changes in v10:
> - Added missing VM_BUG_ON
Thanks for addressing my comments.
I see that patch 8 was updated to address David's comments. I will also
review patch 8 later.
I am again requesting that someone with more cgroup experience comment on
this patch. I do not have enough experience to determine if the code to
not reparent/zombie cgroup is correct.
For now,
Acked-by: Mike Kravetz <[email protected]>
while waiting for cgroup comments.
--
Mike Kravetz
On Fri, Jan 17, 2020 at 11:26 AM Mike Kravetz <[email protected]> wrote:
>
> On 1/14/20 5:26 PM, Mina Almasry wrote:
> > Augments hugetlb_cgroup_charge_cgroup to be able to charge hugetlb
> > usage or hugetlb reservation counter.
> >
> > Adds a new interface to uncharge a hugetlb_cgroup counter via
> > hugetlb_cgroup_uncharge_counter.
> >
> > Integrates the counter with hugetlb_cgroup, via hugetlb_cgroup_init,
> > hugetlb_cgroup_have_usage, and hugetlb_cgroup_css_offline.
> >
> > Signed-off-by: Mina Almasry <[email protected]>
> >
> > ---
> >
> > Changes in v10:
> > - Added missing VM_BUG_ON
>
> Thanks for addressing my comments.
> I see that patch 8 was updated to address David's comments. I will also
> review patch 8 later.
>
> I am again requesting that someone with more cgroup experience comment on
> this patch. I do not have enough experience to determine if the code to
> not reparent/zombie cgroup is correct.
>
Thank you very much for your continued reviews and I'm sorry I'm
having so much trouble getting anyone with cgroup experience to look
deeply into this. My guess is that it's not a very active feature and
the patch series is long and complex, so folks are not interested.
Nevertheless I'm continually poking David and Shakeel and they will
hopefully continue to look into this.
FWIW, I have a test for reparenting/counting zombies and the tests at
least don't uncover any problems.
> For now,
> Acked-by: Mike Kravetz <[email protected]>
> while waiting for cgroup comments.
>
> --
> Mike Kravetz
On Tue, 14 Jan 2020, Mina Almasry wrote:
> diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
> index 063962f6dfc6a..eab8a70d5bcb5 100644
> --- a/include/linux/hugetlb_cgroup.h
> +++ b/include/linux/hugetlb_cgroup.h
> @@ -20,29 +20,37 @@
> struct hugetlb_cgroup;
> /*
> * Minimum page order trackable by hugetlb cgroup.
> - * At least 3 pages are necessary for all the tracking information.
> + * At least 4 pages are necessary for all the tracking information.
> */
> #define HUGETLB_CGROUP_MIN_ORDER 2
I always struggle with a way to document and protect these types of
usages. In this case, we are using the private field of tail pages; in
thp code, we enumerate these usages separately in struct page: see "Tail
pages of compound page" comment in the union. Using the private field is
fine to store a pointer to the hugetlb_cgroup, but I'm wondering if we can
document or protect against future patches not understanding this usage.
Otherwise it's implicit beyond this comment.
Maybe an expanded comment here is the only thing that is needed because
it's unique to struct hugetlb_cgroup that describes what struct page
represents for the second, third, and (now) fourth tail page.
Or, we could go even more explicit and define an enum that defines the
third tail page is for hugetlb_cgroup limits and fourth is for
hugetlb_cgroup reservation limits.
enum {
...
H_CG_LIMIT,
H_CG_RESV,
MAX_H_CG_PRIVATE,
};
and then do a
BUILD_BUG_ON(MAX_H_CG_PRIVATE > (1 << HUGETLB_CGROUP_MIN_ORDER));
somewhere? And then use H_CG_RESV (or a better name) as the offset from
the head page in the code?
>
> #ifdef CONFIG_CGROUP_HUGETLB
>
> -static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page)
> +static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page,
> + bool reserved)
When implementing specs, I immediately see "bool reserved" or
"int reserved:1" as reserved for future use :) Maybe simple enough to
just name these formals as resv?
Otherwise looks like a simple conversion of existing functionality and
extension to the new limits.
Acked-by: David Rientjes <[email protected]>
On 1/29/20 1:21 PM, David Rientjes wrote:
> On Tue, 14 Jan 2020, Mina Almasry wrote:
>
>> diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
>> index 063962f6dfc6a..eab8a70d5bcb5 100644
>> --- a/include/linux/hugetlb_cgroup.h
>> +++ b/include/linux/hugetlb_cgroup.h
>> @@ -20,29 +20,37 @@
>> struct hugetlb_cgroup;
>> /*
>> * Minimum page order trackable by hugetlb cgroup.
>> - * At least 3 pages are necessary for all the tracking information.
>> + * At least 4 pages are necessary for all the tracking information.
>> */
>> #define HUGETLB_CGROUP_MIN_ORDER 2
>
> I always struggle with a way to document and protect these types of
> usages. In this case, we are using the private field of tail pages; in
> thp code, we enumerate these usages separately in struct page: see "Tail
> pages of compound page" comment in the union. Using the private field is
> fine to store a pointer to the hugetlb_cgroup, but I'm wondering if we can
> document or protect against future patches not understanding this usage.
> Otherwise it's implicit beyond this comment.
>
> Maybe an expanded comment here is the only thing that is needed because
> it's unique to struct hugetlb_cgroup that describes what struct page
> represents for the second, third, and (now) fourth tail page.
I think that expanding the comment may be sufficient. Let's at least
document what the private field of the of the tail pages are used for
WRT cgroups.
Second tail page (hpage[2]) usage cgroup
Third tail page (hpage[3]) reservation cgroup
BTW, we are just adding usage of the third tail page IIUC. The comment
that 4 pages are necessary includes the head page.
--
Mike Kravetz