2020-01-15 01:28:26

by Mina Almasry

[permalink] [raw]
Subject: [PATCH v10 3/8] hugetlb_cgroup: add reservation accounting for private mappings

Normally the pointer to the cgroup to uncharge hangs off the struct
page, and gets queried when it's time to free the page. With
hugetlb_cgroup reservations, this is not possible, because a page may be
reserved by one task and actually faulted in by another
task.

The best place to put the hugetlb_cgroup pointer to uncharge for
reservations is in the resv_map. But, because the resv_map has different
semantics for private and shared mappings, the code path to
charge/uncharge shared and private mappings is different. This patch
implements charging and uncharging for private mappings.

For private mappings, the counter to uncharge is in
resv_map->reservation_counter. On initializing the resv_map this is set
to NULL. On reservation of a region in a private mapping, the task's
hugetlb_cgroup is charged and the hugetlb_cgroup is placed in
resv_map->reservation_counter.

On hugetlb_vm_op_close, we uncharge resv_map->reservation_counter.

Signed-off-by: Mina Almasry <[email protected]>

---

Changes in v10:
- Fixed cases where the mapping is private but cgroup accounting is disabled.
Changes in V9:
- Updated for reparenting of hugetlb reservation accounting.

---
include/linux/hugetlb.h | 10 +++++++++
include/linux/hugetlb_cgroup.h | 27 ++++++++++++++++++++++++
mm/hugetlb.c | 38 +++++++++++++++++++++++++++++++++-
mm/hugetlb_cgroup.c | 28 -------------------------
4 files changed, 74 insertions(+), 29 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index dea6143aa0685..5491932ea5758 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -46,6 +46,16 @@ struct resv_map {
long adds_in_progress;
struct list_head region_cache;
long region_cache_count;
+#ifdef CONFIG_CGROUP_HUGETLB
+ /*
+ * On private mappings, the counter to uncharge reservations is stored
+ * here. If these fields are 0, then either the mapping is shared, or
+ * cgroup accounting is disabled for this resv_map.
+ */
+ struct page_counter *reservation_counter;
+ unsigned long pages_per_hpage;
+ struct cgroup_subsys_state *css;
+#endif
};
extern struct resv_map *resv_map_alloc(void);
void resv_map_release(struct kref *ref);
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index eab8a70d5bcb5..8c320accefe87 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -25,6 +25,33 @@ struct hugetlb_cgroup;
#define HUGETLB_CGROUP_MIN_ORDER 2

#ifdef CONFIG_CGROUP_HUGETLB
+enum hugetlb_memory_event {
+ HUGETLB_MAX,
+ HUGETLB_NR_MEMORY_EVENTS,
+};
+
+struct hugetlb_cgroup {
+ struct cgroup_subsys_state css;
+
+ /*
+ * the counter to account for hugepages from hugetlb.
+ */
+ struct page_counter hugepage[HUGE_MAX_HSTATE];
+
+ /*
+ * the counter to account for hugepage reservations from hugetlb.
+ */
+ struct page_counter reserved_hugepage[HUGE_MAX_HSTATE];
+
+ atomic_long_t events[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
+ atomic_long_t events_local[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
+
+ /* Handle for "hugetlb.events" */
+ struct cgroup_file events_file[HUGE_MAX_HSTATE];
+
+ /* Handle for "hugetlb.events.local" */
+ struct cgroup_file events_local_file[HUGE_MAX_HSTATE];
+};

static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page,
bool reserved)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 62a4cf3db4090..f1b63946ee95c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -666,6 +666,17 @@ struct resv_map *resv_map_alloc(void)
INIT_LIST_HEAD(&resv_map->regions);

resv_map->adds_in_progress = 0;
+#ifdef CONFIG_CGROUP_HUGETLB
+ /*
+ * Initialize these to 0. On shared mappings, 0's here indicate these
+ * fields don't do cgroup accounting. On private mappings, these will be
+ * re-initialized to the proper values, to indicate that hugetlb cgroup
+ * reservations are to be un-charged from here.
+ */
+ resv_map->reservation_counter = NULL;
+ resv_map->pages_per_hpage = 0;
+ resv_map->css = NULL;
+#endif

INIT_LIST_HEAD(&resv_map->region_cache);
list_add(&rg->link, &resv_map->region_cache);
@@ -3194,7 +3205,11 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)

reserve = (end - start) - region_count(resv, start, end);

- kref_put(&resv->refs, resv_map_release);
+#ifdef CONFIG_CGROUP_HUGETLB
+ hugetlb_cgroup_uncharge_counter(resv->reservation_counter,
+ (end - start) * resv->pages_per_hpage,
+ resv->css);
+#endif

if (reserve) {
/*
@@ -3204,6 +3219,8 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
hugetlb_acct_memory(h, -gbl_reserve);
}
+
+ kref_put(&resv->refs, resv_map_release);
}

static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
@@ -4550,6 +4567,7 @@ int hugetlb_reserve_pages(struct inode *inode,
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
+ struct hugetlb_cgroup *h_cg;
long gbl_reserve;

/* This should never happen */
@@ -4583,12 +4601,30 @@ int hugetlb_reserve_pages(struct inode *inode,
chg = region_chg(resv_map, from, to);

} else {
+ /* Private mapping. */
resv_map = resv_map_alloc();
if (!resv_map)
return -ENOMEM;

chg = to - from;

+ if (hugetlb_cgroup_charge_cgroup(hstate_index(h),
+ chg * pages_per_huge_page(h),
+ &h_cg, true)) {
+ kref_put(&resv_map->refs, resv_map_release);
+ return -ENOMEM;
+ }
+
+#ifdef CONFIG_CGROUP_HUGETLB
+ /*
+ * Since this branch handles private mappings, we attach the
+ * counter to uncharge for this reservation off resv_map.
+ */
+ resv_map->reservation_counter =
+ &h_cg->reserved_hugepage[hstate_index(h)];
+ resv_map->pages_per_hpage = pages_per_huge_page(h);
+#endif
+
set_vma_resv_map(vma, resv_map);
set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
}
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index c434f69f38354..ddfdf19a9ad35 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -23,34 +23,6 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>

-enum hugetlb_memory_event {
- HUGETLB_MAX,
- HUGETLB_NR_MEMORY_EVENTS,
-};
-
-struct hugetlb_cgroup {
- struct cgroup_subsys_state css;
-
- /*
- * the counter to account for hugepages from hugetlb.
- */
- struct page_counter hugepage[HUGE_MAX_HSTATE];
-
- /*
- * the counter to account for hugepage reservations from hugetlb.
- */
- struct page_counter reserved_hugepage[HUGE_MAX_HSTATE];
-
- atomic_long_t events[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
- atomic_long_t events_local[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
-
- /* Handle for "hugetlb.events" */
- struct cgroup_file events_file[HUGE_MAX_HSTATE];
-
- /* Handle for "hugetlb.events.local" */
- struct cgroup_file events_local_file[HUGE_MAX_HSTATE];
-};
-
#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
--
2.25.0.rc1.283.g88dfdc4193-goog


2020-01-17 23:00:18

by Mike Kravetz

[permalink] [raw]
Subject: Re: [PATCH v10 3/8] hugetlb_cgroup: add reservation accounting for private mappings

On 1/14/20 5:26 PM, Mina Almasry wrote:
> Normally the pointer to the cgroup to uncharge hangs off the struct
> page, and gets queried when it's time to free the page. With
> hugetlb_cgroup reservations, this is not possible. Because it's possible
> for a page to be reserved by one task and actually faulted in by another
> task.
>
> The best place to put the hugetlb_cgroup pointer to uncharge for
> reservations is in the resv_map. But, because the resv_map has different
> semantics for private and shared mappings, the code patch to
> charge/uncharge shared and private mappings is different. This patch
> implements charging and uncharging for private mappings.
>
> For private mappings, the counter to uncharge is in
> resv_map->reservation_counter. On initializing the resv_map this is set
> to NULL. On reservation of a region in private mapping, the tasks
> hugetlb_cgroup is charged and the hugetlb_cgroup is placed is
> resv_map->reservation_counter.
>
> On hugetlb_vm_op_close, we uncharge resv_map->reservation_counter.
>
> Signed-off-by: Mina Almasry <[email protected]>

Reviewed-by: Mike Kravetz <[email protected]>

--
Mike Kravetz

2020-01-29 21:29:33

by David Rientjes

[permalink] [raw]
Subject: Re: [PATCH v10 3/8] hugetlb_cgroup: add reservation accounting for private mappings

On Tue, 14 Jan 2020, Mina Almasry wrote:

> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index dea6143aa0685..5491932ea5758 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -46,6 +46,16 @@ struct resv_map {
> long adds_in_progress;
> struct list_head region_cache;
> long region_cache_count;
> +#ifdef CONFIG_CGROUP_HUGETLB
> + /*
> + * On private mappings, the counter to uncharge reservations is stored
> + * here. If these fields are 0, then either the mapping is shared, or
> + * cgroup accounting is disabled for this resv_map.
> + */
> + struct page_counter *reservation_counter;
> + unsigned long pages_per_hpage;
> + struct cgroup_subsys_state *css;
> +#endif
> };
> extern struct resv_map *resv_map_alloc(void);
> void resv_map_release(struct kref *ref);
> diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
> index eab8a70d5bcb5..8c320accefe87 100644
> --- a/include/linux/hugetlb_cgroup.h
> +++ b/include/linux/hugetlb_cgroup.h
> @@ -25,6 +25,33 @@ struct hugetlb_cgroup;
> #define HUGETLB_CGROUP_MIN_ORDER 2
>
> #ifdef CONFIG_CGROUP_HUGETLB
> +enum hugetlb_memory_event {
> + HUGETLB_MAX,
> + HUGETLB_NR_MEMORY_EVENTS,
> +};
> +
> +struct hugetlb_cgroup {
> + struct cgroup_subsys_state css;
> +
> + /*
> + * the counter to account for hugepages from hugetlb.
> + */
> + struct page_counter hugepage[HUGE_MAX_HSTATE];
> +
> + /*
> + * the counter to account for hugepage reservations from hugetlb.
> + */
> + struct page_counter reserved_hugepage[HUGE_MAX_HSTATE];
> +
> + atomic_long_t events[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
> + atomic_long_t events_local[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
> +
> + /* Handle for "hugetlb.events" */
> + struct cgroup_file events_file[HUGE_MAX_HSTATE];
> +
> + /* Handle for "hugetlb.events.local" */
> + struct cgroup_file events_local_file[HUGE_MAX_HSTATE];
> +};
>
> static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page,
> bool reserved)
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 62a4cf3db4090..f1b63946ee95c 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -666,6 +666,17 @@ struct resv_map *resv_map_alloc(void)
> INIT_LIST_HEAD(&resv_map->regions);
>
> resv_map->adds_in_progress = 0;
> +#ifdef CONFIG_CGROUP_HUGETLB
> + /*
> + * Initialize these to 0. On shared mappings, 0's here indicate these
> + * fields don't do cgroup accounting. On private mappings, these will be
> + * re-initialized to the proper values, to indicate that hugetlb cgroup
> + * reservations are to be un-charged from here.
> + */
> + resv_map->reservation_counter = NULL;
> + resv_map->pages_per_hpage = 0;
> + resv_map->css = NULL;
> +#endif

Might be better to extract out a resv_map_init() that does the
initialization when CONFIG_CGROUP_HUGETLB is enabled? Could be used here
as well as hugetlb_reserve_pages().

>
> INIT_LIST_HEAD(&resv_map->region_cache);
> list_add(&rg->link, &resv_map->region_cache);
> @@ -3194,7 +3205,11 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
>
> reserve = (end - start) - region_count(resv, start, end);
>
> - kref_put(&resv->refs, resv_map_release);
> +#ifdef CONFIG_CGROUP_HUGETLB
> + hugetlb_cgroup_uncharge_counter(resv->reservation_counter,
> + (end - start) * resv->pages_per_hpage,
> + resv->css);
> +#endif
>
> if (reserve) {
> /*

Mike has given his Reviewed-by so likely not a big concern for the generic
hugetlb code, but I was wondering if we can reduce the number of #ifdef's
if we defined a CONFIG_CGROUP_HUGETLB helper to take the resv, end, and
start? If CONFIG_CGROUP_HUGETLB is defined, it converts into the above,
otherwise it's a no-op and we don't run into any compile errors because we
are accessing fields that don't exist without the option.

Otherwise looks good!

Acked-by: David Rientjes <[email protected]>

2020-02-03 23:20:02

by Mina Almasry

[permalink] [raw]
Subject: Re: [PATCH v10 3/8] hugetlb_cgroup: add reservation accounting for private mappings

On Wed, Jan 29, 2020 at 1:28 PM David Rientjes <[email protected]> wrote:
>
> On Tue, 14 Jan 2020, Mina Almasry wrote:
>
> > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > index dea6143aa0685..5491932ea5758 100644
> > --- a/include/linux/hugetlb.h
> > +++ b/include/linux/hugetlb.h
> > @@ -46,6 +46,16 @@ struct resv_map {
> > long adds_in_progress;
> > struct list_head region_cache;
> > long region_cache_count;
> > +#ifdef CONFIG_CGROUP_HUGETLB
> > + /*
> > + * On private mappings, the counter to uncharge reservations is stored
> > + * here. If these fields are 0, then either the mapping is shared, or
> > + * cgroup accounting is disabled for this resv_map.
> > + */
> > + struct page_counter *reservation_counter;
> > + unsigned long pages_per_hpage;
> > + struct cgroup_subsys_state *css;
> > +#endif
> > };
> > extern struct resv_map *resv_map_alloc(void);
> > void resv_map_release(struct kref *ref);
> > diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
> > index eab8a70d5bcb5..8c320accefe87 100644
> > --- a/include/linux/hugetlb_cgroup.h
> > +++ b/include/linux/hugetlb_cgroup.h
> > @@ -25,6 +25,33 @@ struct hugetlb_cgroup;
> > #define HUGETLB_CGROUP_MIN_ORDER 2
> >
> > #ifdef CONFIG_CGROUP_HUGETLB
> > +enum hugetlb_memory_event {
> > + HUGETLB_MAX,
> > + HUGETLB_NR_MEMORY_EVENTS,
> > +};
> > +
> > +struct hugetlb_cgroup {
> > + struct cgroup_subsys_state css;
> > +
> > + /*
> > + * the counter to account for hugepages from hugetlb.
> > + */
> > + struct page_counter hugepage[HUGE_MAX_HSTATE];
> > +
> > + /*
> > + * the counter to account for hugepage reservations from hugetlb.
> > + */
> > + struct page_counter reserved_hugepage[HUGE_MAX_HSTATE];
> > +
> > + atomic_long_t events[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
> > + atomic_long_t events_local[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
> > +
> > + /* Handle for "hugetlb.events" */
> > + struct cgroup_file events_file[HUGE_MAX_HSTATE];
> > +
> > + /* Handle for "hugetlb.events.local" */
> > + struct cgroup_file events_local_file[HUGE_MAX_HSTATE];
> > +};
> >
> > static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page,
> > bool reserved)
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 62a4cf3db4090..f1b63946ee95c 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -666,6 +666,17 @@ struct resv_map *resv_map_alloc(void)
> > INIT_LIST_HEAD(&resv_map->regions);
> >
> > resv_map->adds_in_progress = 0;
> > +#ifdef CONFIG_CGROUP_HUGETLB
> > + /*
> > + * Initialize these to 0. On shared mappings, 0's here indicate these
> > + * fields don't do cgroup accounting. On private mappings, these will be
> > + * re-initialized to the proper values, to indicate that hugetlb cgroup
> > + * reservations are to be un-charged from here.
> > + */
> > + resv_map->reservation_counter = NULL;
> > + resv_map->pages_per_hpage = 0;
> > + resv_map->css = NULL;
> > +#endif
>
> Might be better to extract out a resv_map_init() that does the
> initialization when CONFIG_CGROUP_HUGETLB is enabled? Could be used here
> as well as hugetlb_reserve_pages().
>
> >
> > INIT_LIST_HEAD(&resv_map->region_cache);
> > list_add(&rg->link, &resv_map->region_cache);
> > @@ -3194,7 +3205,11 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
> >
> > reserve = (end - start) - region_count(resv, start, end);
> >
> > - kref_put(&resv->refs, resv_map_release);
> > +#ifdef CONFIG_CGROUP_HUGETLB
> > + hugetlb_cgroup_uncharge_counter(resv->reservation_counter,
> > + (end - start) * resv->pages_per_hpage,
> > + resv->css);
> > +#endif
> >
> > if (reserve) {
> > /*
>
> Mike has given is Reviewed-by so likely not a big concern for the generic
> hugetlb code, but I was wondering if we can reduce the number of #ifdef's
> if we defined a CONFIG_CGROUP_HUGETLB helper to take the resv, end, and
> start? If CONFIG_CGROUP_HUGETLB is defined, it converts into the above,
> otherwise it's a no-op and we don't run into any compile errors because we
> are accessing fields that don't exist without the option.
>

Yes wherever possible I refactored the code a bit to remove #ifdefs in
the middle of hugetlb logic.

> Otherwise looks good!
>
> Acked-by: David Rientjes <[email protected]>