From: Yang Yang <[email protected]>
Using THP may improve memory performance, but it also increases memory
footprint. Applications can use madvise to reduce the footprint, but not
all applications support madvise, and re-coding all of them is costly.
Meanwhile, containers have become more and more popular for managing a
set of tasks.
So adding support for controlling THP behaviour per cgroup provides much
convenience: an administrator may enable THP only for important
containers and disable it for others. Then we can enjoy the high
performance of THP while minimizing memory footprint, without re-coding
any application.
Cgroup v1 is still used by many distributions, so add this interface to
cgroup v1.
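For illustration, a minimal userspace sketch of driving the new knob from a
container manager. It assumes the cgroup v1 memory controller is mounted at
/sys/fs/cgroup/memory and that the cftype added below appears as
memory.transparent_hugepage.enabled; both the helper and the path layout are
illustrative, not part of this patch:
/* Sketch: set one memory cgroup's THP policy to "always", "madvise"
 * or "never".  The mount point and file name are assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int set_cgroup_thp(const char *cgroup, const char *policy)
{
	char path[256];
	int fd, ret = 0;

	snprintf(path, sizeof(path),
		 "/sys/fs/cgroup/memory/%s/memory.transparent_hugepage.enabled",
		 cgroup);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	if (write(fd, policy, strlen(policy)) < 0)
		ret = -1;
	close(fd);
	return ret;
}
For example, set_cgroup_thp("important", "always") corresponds to enabling
THP only for an important container as described above.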
Signed-off-by: Yang Yang <[email protected]>
---
include/linux/huge_mm.h | 33 +--------------
include/linux/khugepaged.h | 19 +++------
include/linux/memcontrol.h | 53 ++++++++++++++++++++++++
mm/huge_memory.c | 34 ++++++++++++++++
mm/khugepaged.c | 36 ++++++++++++++++-
mm/memcontrol.c | 82 ++++++++++++++++++++++++++++++++++++++
6 files changed, 211 insertions(+), 46 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index fbf36bb1be22..fa2cb3d06ecb 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -141,38 +141,6 @@ static inline bool transhuge_vma_enabled(struct vm_area_struct *vma,
return true;
}
-/*
- * to be used on vmas which are known to support THP.
- * Use transparent_hugepage_active otherwise
- */
-static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
-{
-
- /*
- * If the hardware/firmware marked hugepage support disabled.
- */
- if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX))
- return false;
-
- if (!transhuge_vma_enabled(vma, vma->vm_flags))
- return false;
-
- if (vma_is_temporary_stack(vma))
- return false;
-
- if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG))
- return true;
-
- if (vma_is_dax(vma))
- return true;
-
- if (transparent_hugepage_flags &
- (1 << TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))
- return !!(vma->vm_flags & VM_HUGEPAGE);
-
- return false;
-}
-
bool transparent_hugepage_active(struct vm_area_struct *vma);
#define transparent_hugepage_use_zero_page() \
@@ -302,6 +270,7 @@ static inline struct list_head *page_deferred_list(struct page *page)
*/
return &page[2].deferred_list;
}
+inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma);
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index 2fcc01891b47..b77b065ebf16 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -26,16 +26,9 @@ static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
}
#endif
-#define khugepaged_enabled() \
- (transparent_hugepage_flags & \
- ((1<<TRANSPARENT_HUGEPAGE_FLAG) | \
- (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)))
-#define khugepaged_always() \
- (transparent_hugepage_flags & \
- (1<<TRANSPARENT_HUGEPAGE_FLAG))
-#define khugepaged_req_madv() \
- (transparent_hugepage_flags & \
- (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))
+extern inline int khugepaged_enabled(void);
+extern inline int khugepaged_always(struct vm_area_struct *vma);
+extern inline int khugepaged_req_madv(struct vm_area_struct *vma);
#define khugepaged_defrag() \
(transparent_hugepage_flags & \
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
@@ -57,9 +50,9 @@ static inline int khugepaged_enter(struct vm_area_struct *vma,
unsigned long vm_flags)
{
if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags))
- if ((khugepaged_always() ||
- (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) ||
- (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) &&
+ if ((khugepaged_always(vma) ||
+ (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) ||
+ (khugepaged_req_madv(vma) && (vm_flags & VM_HUGEPAGE))) &&
!(vm_flags & VM_NOHUGEPAGE) &&
!test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
if (__khugepaged_enter(vma->vm_mm))
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8ea4b541c31e..c5f9f4b267bd 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -28,6 +28,13 @@ struct page;
struct mm_struct;
struct kmem_cache;
+/*
+ * Increase when sub cgroup enable transparent hugepage, decrease when
+ * sub cgroup disable transparent hugepage. Help decide whether to run
+ * khugepaged.
+ */
+extern atomic_t sub_thp_count;
+
/* Cgroup-specific page state, on top of universal node page state */
enum memcg_stat_item {
MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
@@ -343,6 +350,7 @@ struct mem_cgroup {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct deferred_split deferred_split_queue;
#endif
+ unsigned long thp_flag;
struct mem_cgroup_per_node *nodeinfo[];
};
@@ -1127,6 +1135,32 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
gfp_t gfp_mask,
unsigned long *total_scanned);
+static inline unsigned long mem_cgroup_thp_flag(struct mem_cgroup *memcg)
+{
+ if (unlikely(memcg == NULL) || mem_cgroup_disabled() ||
+ mem_cgroup_is_root(memcg))
+ return transparent_hugepage_flags;
+
+ return memcg->thp_flag;
+}
+
+static inline int memcg_sub_thp_enabled(void)
+{
+ return atomic_read(&sub_thp_count) != 0;
+}
+
+static inline void memcg_sub_thp_enable(struct mem_cgroup *memcg)
+{
+ if (!mem_cgroup_is_root(memcg))
+ atomic_inc(&sub_thp_count);
+}
+
+static inline void memcg_sub_thp_disable(struct mem_cgroup *memcg)
+{
+ if (!mem_cgroup_is_root(memcg))
+ atomic_dec(&sub_thp_count);
+}
+
#else /* CONFIG_MEMCG */
#define MEM_CGROUP_ID_SHIFT 0
@@ -1524,6 +1558,25 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
{
return 0;
}
+
+static inline unsigned long mem_cgroup_thp_flag(struct mem_cgroup *memcg)
+{
+ return transparent_hugepage_flags;
+}
+
+static inline int memcg_sub_thp_enabled(void)
+{
+ return 0;
+}
+
+static inline void memcg_sub_thp_enable(struct mem_cgroup *memcg)
+{
+}
+
+static inline void memcg_sub_thp_disable(struct mem_cgroup *memcg)
+{
+}
+
#endif /* CONFIG_MEMCG */
static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6bf0ec9ac4e4..09c80b6a18d6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3173,4 +3173,38 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
update_mmu_cache_pmd(vma, address, pvmw->pmd);
trace_remove_migration_pmd(address, pmd_val(pmde));
}
+
+/*
+ * to be used on vmas which are known to support THP.
+ * Use transparent_hugepage_active otherwise
+ */
+inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
+{
+ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(vma->vm_mm);
+
+ /*
+ * If the hardware/firmware marked hugepage support disabled.
+ */
+ if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX))
+ return false;
+
+ if (!transhuge_vma_enabled(vma, vma->vm_flags))
+ return false;
+
+ if (vma_is_temporary_stack(vma))
+ return false;
+
+ if (mem_cgroup_thp_flag(memcg) & (1 << TRANSPARENT_HUGEPAGE_FLAG))
+ return true;
+
+ if (vma_is_dax(vma))
+ return true;
+
+ if (mem_cgroup_thp_flag(memcg) &
+ (1 << TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))
+ return !!(vma->vm_flags & VM_HUGEPAGE);
+
+ return false;
+}
+
#endif
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index eb444fd45568..8386d8d1d423 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -454,7 +454,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma,
return shmem_huge_enabled(vma);
/* THP settings require madvise. */
- if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
+ if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always(vma))
return false;
/* Only regular file is valid */
@@ -1537,6 +1537,40 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
goto drop_hpage;
}
+inline int khugepaged_enabled(void)
+{
+ if ((transparent_hugepage_flags &
+ ((1<<TRANSPARENT_HUGEPAGE_FLAG) |
+ (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))) ||
+ memcg_sub_thp_enabled())
+ return 1;
+ else
+ return 0;
+}
+
+inline int khugepaged_req_madv(struct vm_area_struct *vma)
+{
+ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(vma->vm_mm);
+
+ if (mem_cgroup_thp_flag(memcg) &
+ (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))
+ return 1;
+ else
+ return 0;
+}
+
+inline int khugepaged_always(struct vm_area_struct *vma)
+{
+ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(vma->vm_mm);
+
+ if (mem_cgroup_thp_flag(memcg) &
+ (1<<TRANSPARENT_HUGEPAGE_FLAG))
+ return 1;
+ else
+ return 0;
+}
+
+
static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
{
struct mm_struct *mm = mm_slot->mm;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e1b5823ac060..1372324f76e3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -63,6 +63,7 @@
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
+#include <linux/khugepaged.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -99,6 +100,8 @@ static bool cgroup_memory_noswap __ro_after_init;
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
+atomic_t sub_thp_count __read_mostly = ATOMIC_INIT(0);
+
/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
@@ -4823,6 +4826,71 @@ static int mem_cgroup_slab_show(struct seq_file *m, void *p)
}
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int mem_cgroup_thp_flag_show(struct seq_file *sf, void *v)
+{
+ const char *output;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
+ unsigned long flag = mem_cgroup_thp_flag(memcg);
+
+ if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &flag))
+ output = "[always] madvise never";
+ else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &flag))
+ output = "always [madvise] never";
+ else
+ output = "always madvise [never]";
+
+ seq_printf(sf, "%s\n", output);
+ return 0;
+}
+
+static ssize_t mem_cgroup_thp_flag_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ ssize_t ret = nbytes;
+ unsigned long *flag;
+
+ if (!mem_cgroup_is_root(memcg))
+ flag = &memcg->thp_flag;
+ else
+ flag = &transparent_hugepage_flags;
+
+ if (sysfs_streq(buf, "always")) {
+ if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG, flag)) {
+ set_bit(TRANSPARENT_HUGEPAGE_FLAG, flag);
+ /* change disable to enable */
+ if (!test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, flag))
+ memcg_sub_thp_enable(memcg);
+ }
+ } else if (sysfs_streq(buf, "madvise")) {
+ if (!test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, flag)) {
+ set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, flag);
+ /* change disable to enable */
+ if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG, flag))
+ memcg_sub_thp_enable(memcg);
+ }
+ } else if (sysfs_streq(buf, "never")) {
+ /* change enable to disable */
+ if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, flag) ||
+ test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, flag)) {
+ clear_bit(TRANSPARENT_HUGEPAGE_FLAG, flag);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, flag);
+ memcg_sub_thp_disable(memcg);
+ }
+ } else
+ ret = -EINVAL;
+
+ if (ret > 0) {
+ int err = start_stop_khugepaged();
+
+ if (err)
+ ret = err;
+ }
+ return ret;
+}
+#endif
+
static struct cftype mem_cgroup_legacy_files[] = {
{
.name = "usage_in_bytes",
@@ -4948,6 +5016,13 @@ static struct cftype mem_cgroup_legacy_files[] = {
.write = mem_cgroup_reset,
.read_u64 = mem_cgroup_read_u64,
},
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ {
+ .name = "transparent_hugepage.enabled",
+ .seq_show = mem_cgroup_thp_flag_show,
+ .write = mem_cgroup_thp_flag_write,
+ },
+#endif
{ }, /* terminate */
};
@@ -5145,8 +5220,14 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
if (parent) {
memcg->swappiness = mem_cgroup_swappiness(parent);
+ memcg->thp_flag = mem_cgroup_thp_flag(parent);
memcg->oom_kill_disable = parent->oom_kill_disable;
+ if (memcg->thp_flag &
+ ((1<<TRANSPARENT_HUGEPAGE_FLAG) |
+ (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)))
+ memcg_sub_thp_enable(memcg);
+
page_counter_init(&memcg->memory, &parent->memory);
page_counter_init(&memcg->swap, &parent->swap);
page_counter_init(&memcg->kmem, &parent->kmem);
@@ -5220,6 +5301,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
memcg_offline_kmem(memcg);
reparent_shrinker_deferred(memcg);
wb_memcg_offline(memcg);
+ memcg_sub_thp_disable(memcg);
drain_all_stock(memcg);
--
2.25.1
Hi,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on linux/master]
[also build test WARNING on linus/master v5.18-rc5]
[cannot apply to hnaz-mm/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/intel-lab-lkp/linux/commits/cgel-zte-gmail-com/mm-memcg-support-control-THP-behaviour-in-cgroup/20220505-114028
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 107c948d1d3e61d10aee9d0f7c3d81bbee9842af
config: i386-randconfig-s001 (https://download.01.org/0day-ci/archive/20220505/[email protected]/config)
compiler: gcc-11 (Debian 11.2.0-20) 11.2.0
reproduce:
# apt-get install sparse
# sparse version: v0.6.4-dirty
# https://github.com/intel-lab-lkp/linux/commit/f08a35b9798572693a91c6a3d823ed9ae54ef688
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review cgel-zte-gmail-com/mm-memcg-support-control-THP-behaviour-in-cgroup/20220505-114028
git checkout f08a35b9798572693a91c6a3d823ed9ae54ef688
# save the config file
mkdir build_dir && cp config build_dir/.config
make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' O=build_dir ARCH=i386 SHELL=/bin/bash
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>
sparse warnings: (new ones prefixed by >>)
mm/huge_memory.c: note: in included file (through include/linux/mm.h):
>> include/linux/huge_mm.h:272:43: sparse: sparse: marked inline, but without a definition
mm/huge_memory.c: note: in included file:
>> include/linux/khugepaged.h:30:36: sparse: sparse: marked inline, but without a definition
include/linux/khugepaged.h:31:38: sparse: sparse: marked inline, but without a definition
--
mm/memory.c:1024:17: sparse: sparse: context imbalance in 'copy_pte_range' - different lock contexts for basic block
mm/memory.c:1752:16: sparse: sparse: context imbalance in '__get_locked_pte' - different lock contexts for basic block
mm/memory.c:1800:9: sparse: sparse: context imbalance in 'insert_page' - different lock contexts for basic block
mm/memory.c:2302:17: sparse: sparse: context imbalance in 'remap_pte_range' - different lock contexts for basic block
mm/memory.c:2558:17: sparse: sparse: context imbalance in 'apply_to_pte_range' - unexpected unlock
mm/memory.c:2847:9: sparse: sparse: context imbalance in 'wp_page_copy' - different lock contexts for basic block
mm/memory.c:3185:17: sparse: sparse: context imbalance in 'wp_pfn_shared' - unexpected unlock
mm/memory.c:3248:19: sparse: sparse: context imbalance in 'do_wp_page' - different lock contexts for basic block
mm/memory.c: note: in included file (through include/linux/mm.h):
>> include/linux/huge_mm.h:272:43: sparse: sparse: marked inline, but without a definition
>> include/linux/huge_mm.h:272:43: sparse: sparse: marked inline, but without a definition
--
mm/shmem.c: note: in included file:
>> include/linux/khugepaged.h:30:36: sparse: sparse: marked inline, but without a definition
include/linux/khugepaged.h:31:38: sparse: sparse: marked inline, but without a definition
>> include/linux/khugepaged.h:30:36: sparse: sparse: marked inline, but without a definition
include/linux/khugepaged.h:31:38: sparse: sparse: marked inline, but without a definition
vim +272 include/linux/huge_mm.h
263
264 static inline struct list_head *page_deferred_list(struct page *page)
265 {
266 /*
267 * Global or memcg deferred list in the second tail pages is
268 * occupied by compound_head.
269 */
270 return &page[2].deferred_list;
271 }
> 272 inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma);
273
--
0-DAY CI Kernel Test Service
https://01.org/lkp
Hi,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on linux/master]
[also build test ERROR on linus/master v5.18-rc5 next-20220505]
[cannot apply to hnaz-mm/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/intel-lab-lkp/linux/commits/cgel-zte-gmail-com/mm-memcg-support-control-THP-behaviour-in-cgroup/20220505-114028
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 107c948d1d3e61d10aee9d0f7c3d81bbee9842af
config: i386-randconfig-a001 (https://download.01.org/0day-ci/archive/20220505/[email protected]/config)
compiler: gcc-11 (Debian 11.2.0-20) 11.2.0
reproduce (this is a W=1 build):
# https://github.com/intel-lab-lkp/linux/commit/f08a35b9798572693a91c6a3d823ed9ae54ef688
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review cgel-zte-gmail-com/mm-memcg-support-control-THP-behaviour-in-cgroup/20220505-114028
git checkout f08a35b9798572693a91c6a3d823ed9ae54ef688
# save the config file
mkdir build_dir && cp config build_dir/.config
make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>
All errors (new ones prefixed by >>):
ld: mm/memory.o: in function `__handle_mm_fault':
>> memory.c:(.text+0xc6f9): undefined reference to `__transparent_hugepage_enabled'
>> ld: memory.c:(.text+0xc751): undefined reference to `__transparent_hugepage_enabled'
ld: mm/huge_memory.o: in function `transparent_hugepage_active':
>> huge_memory.c:(.text+0x6351): undefined reference to `__transparent_hugepage_enabled'
--
0-DAY CI Kernel Test Service
https://01.org/lkp
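Both robot reports point at the same problem: the headers now declare
__transparent_hugepage_enabled() and the khugepaged helpers as `inline` while
their bodies live in single .c files, so sparse warns about inline
declarations without a definition and the linker cannot find an out-of-line
symbol. A minimal sketch of the conventional fix, shown only for illustration
and not as part of the posted patch, is to drop `inline` from the
cross-translation-unit declarations (and from the matching definitions in
mm/huge_memory.c and mm/khugepaged.c):
/* include/linux/huge_mm.h */
bool __transparent_hugepage_enabled(struct vm_area_struct *vma);

/* include/linux/khugepaged.h */
int khugepaged_enabled(void);
int khugepaged_always(struct vm_area_struct *vma);
int khugepaged_req_madv(struct vm_area_struct *vma);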
On Fri, May 06, 2022 at 03:41:50PM +0200, Michal Hocko wrote:
> On Thu 05-05-22 03:38:15, [email protected] wrote:
> > From: Yang Yang <[email protected]>
> >
> > Using THP may promote the performance of memory, but increase memory
> > footprint. Applications may use madvise to decrease footprint, but
> > not all applications support using madvise, and it takes much costs
> > to re-code all the applications. And we notice container becomes more
> > and more popular to manage a set of tasks.
>
> Could you be more specific about the actual usecase? When do you group
> processes based on their general THP reqirements? You are mentioning
> containers but those are usually bags of different processes that just
> share a common objective.
>
> > So add support for cgroup to control THP behaviour will provide much
> > convenience, administrator may only enable THP for important containers,
> > and disable it for other containers.
>
> Why would that be a matter of importance?
>
> Also what is actual semantic when processes living inside those cgroups
> explicitly state their THP requirements?
>
Docker might support this new cgroup knob in the future and provide a UI like:
# docker run -it --thp-enabled=[always,never,madvise]
The cmdline format follows https://docs.docker.com/engine/reference/run/
If there are many containers to run on one host, and some of them have high
performance requirements, administrator could turn on thp for them:
# docker run -it --thp-enabled=always
Then all the processes in those containers will always use thp.
While other containers turn off thp by:
# docker run -it --thp-enabled=never
By doing this we could promote important containers's performance with less
footprint of thp.
> --
> Michal Hocko
> SUSE Labs
Hi,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on linux/master]
[also build test ERROR on linus/master v5.18-rc5 next-20220505]
[cannot apply to hnaz-mm/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/intel-lab-lkp/linux/commits/cgel-zte-gmail-com/mm-memcg-support-control-THP-behaviour-in-cgroup/20220505-114028
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 107c948d1d3e61d10aee9d0f7c3d81bbee9842af
config: mips-decstation_64_defconfig (https://download.01.org/0day-ci/archive/20220505/[email protected]/config)
compiler: mips64el-linux-gcc (GCC) 11.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/f08a35b9798572693a91c6a3d823ed9ae54ef688
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review cgel-zte-gmail-com/mm-memcg-support-control-THP-behaviour-in-cgroup/20220505-114028
git checkout f08a35b9798572693a91c6a3d823ed9ae54ef688
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0 make.cross W=1 O=build_dir ARCH=mips SHELL=/bin/bash
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <[email protected]>
All errors (new ones prefixed by >>):
mips64el-linux-ld: mm/memory.o: in function `__handle_mm_fault':
memory.c:(.text+0x60a8): undefined reference to `__transparent_hugepage_enabled'
>> mips64el-linux-ld: memory.c:(.text+0x6108): undefined reference to `__transparent_hugepage_enabled'
mips64el-linux-ld: mm/huge_memory.o: in function `transparent_hugepage_active':
huge_memory.c:(.text+0x21e4): undefined reference to `__transparent_hugepage_enabled'
--
0-DAY CI Kernel Test Service
https://01.org/lkp
On Sat 07-05-22 02:05:25, CGEL wrote:
[...]
> If there are many containers to run on one host, and some of them have high
> performance requirements, administrator could turn on thp for them:
> # docker run -it --thp-enabled=always
> Then all the processes in those containers will always use thp.
> While other containers turn off thp by:
> # docker run -it --thp-enabled=never
I do not know. The THP config space is already too confusing and complex
and this just adds on top. E.g. is the behavior of the knob
hierarchical? What is the policy if parent memcg says madivise while
child says always? How does the per-application configuration aligns
with all that (e.g. memcg policy madivise but application says never via
prctl while still uses some madvised - e.g. via library).
> By doing this we could promote important containers's performance with less
> footprint of thp.
Do we really want to provide something like THP based QoS? To me it
sounds like a bad idea and if the justification is "it might be useful"
then I would say no. So you really need to come with a very good usecase
to promote this further.
--
Michal Hocko
SUSE Labs
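For reference, the per-application configuration mentioned above is the
existing madvise(2)/prctl(2) interface. A minimal sketch of the madvise side,
with an illustrative helper name and 2MB alignment so huge pages can actually
be mapped:
#include <stdlib.h>
#include <sys/mman.h>

/* Opt an anonymous buffer into THP under the "madvise" policy. */
static void *alloc_thp_backed(size_t size)
{
	void *buf = NULL;

	if (posix_memalign(&buf, 2 * 1024 * 1024, size))
		return NULL;
	/* Hint that this range benefits from transparent huge pages. */
	madvise(buf, size, MADV_HUGEPAGE);
	return buf;
}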
On Thu 05-05-22 03:38:15, [email protected] wrote:
> From: Yang Yang <[email protected]>
>
> Using THP may promote the performance of memory, but increase memory
> footprint. Applications may use madvise to decrease footprint, but
> not all applications support using madvise, and it takes much costs
> to re-code all the applications. And we notice container becomes more
> and more popular to manage a set of tasks.
Could you be more specific about the actual usecase? When do you group
processes based on their general THP reqirements? You are mentioning
containers but those are usually bags of different processes that just
share a common objective.
> So add support for cgroup to control THP behaviour will provide much
> convenience, administrator may only enable THP for important containers,
> and disable it for other containers.
Why would that be a matter of importance?
Also what is actual semantic when processes living inside those cgroups
explicitly state their THP requirements?
--
Michal Hocko
SUSE Labs
On Mon, May 09, 2022 at 12:00:28PM +0200, Michal Hocko wrote:
> On Sat 07-05-22 02:05:25, CGEL wrote:
> [...]
> > If there are many containers to run on one host, and some of them have high
> > performance requirements, administrator could turn on thp for them:
> > # docker run -it --thp-enabled=always
> > Then all the processes in those containers will always use thp.
> > While other containers turn off thp by:
> > # docker run -it --thp-enabled=never
>
> I do not know. The THP config space is already too confusing and complex
> and this just adds on top. E.g. is the behavior of the knob
> hierarchical? What is the policy if parent memcg says madivise while
> child says always? How does the per-application configuration aligns
> with all that (e.g. memcg policy madivise but application says never via
> prctl while still uses some madvised - e.g. via library).
>
The cgroup THP behavior follows the host semantics but is totally independent,
just like /sys/fs/cgroup/memory.swappiness. That means if one cgroup configures
'always' for THP, it does not affect the host or other cgroups. This makes it
simple for users to understand and control.
If the memcg policy is madvise but the application says never, then, just like
on the host, the result is no THP for that application.
> > By doing this we could promote important containers's performance with less
> > footprint of thp.
>
> Do we really want to provide something like THP based QoS? To me it
> sounds like a bad idea and if the justification is "it might be useful"
> then I would say no. So you really need to come with a very good usecase
> to promote this further.
At least on some 5G (communication technology) machines, it's useful to
provide THP-based QoS. Those 5G machines use a micro-service software
architecture; in other words, one service application runs in one container.
The container, not the whole host, becomes the suitable management unit, and
some performance-sensitive containers need THP to provide low-latency
communication. But if we use THP with 'always', it consumes more memory (on
our machine about 10% of total memory), and unnecessary huge pages will
increase memory pressure, add latency to minor page faults, and add overhead
when splitting huge pages or coalescing normal sized pages into huge pages.
So a container manager should welcome cgroup-based THP QoS.
> --
> Michal Hocko
> SUSE Labs
On Mon 09-05-22 11:26:43, CGEL wrote:
> On Mon, May 09, 2022 at 12:00:28PM +0200, Michal Hocko wrote:
> > On Sat 07-05-22 02:05:25, CGEL wrote:
> > [...]
> > > If there are many containers to run on one host, and some of them have high
> > > performance requirements, administrator could turn on thp for them:
> > > # docker run -it --thp-enabled=always
> > > Then all the processes in those containers will always use thp.
> > > While other containers turn off thp by:
> > > # docker run -it --thp-enabled=never
> >
> > I do not know. The THP config space is already too confusing and complex
> > and this just adds on top. E.g. is the behavior of the knob
> > hierarchical? What is the policy if parent memcg says madivise while
> > child says always? How does the per-application configuration aligns
> > with all that (e.g. memcg policy madivise but application says never via
> > prctl while still uses some madvised - e.g. via library).
> >
>
> The cgroup THP behavior is align to host and totally independent just likes
> /sys/fs/cgroup/memory.swappiness. That means if one cgroup config 'always'
> for thp, it has no matter with host or other cgroup. This make it simple for
> user to understand or control.
All controls in cgroup v2 should be hierarchical. This is really
required for a proper delegation semantic.
> If memcg policy madivise but application says never, just like host, the result
> is no THP for that application.
>
> > > By doing this we could promote important containers's performance with less
> > > footprint of thp.
> >
> > Do we really want to provide something like THP based QoS? To me it
> > sounds like a bad idea and if the justification is "it might be useful"
> > then I would say no. So you really need to come with a very good usecase
> > to promote this further.
>
> At least on some 5G(communication technology) machine, it's useful to provide
> THP based QoS. Those 5G machine use micro-service software architecture, in
> other words one service application runs in one container.
I am not really sure I understand. If this is one application per
container (cgroup) then why do you really need per-group setting?
Does the application is a set of different processes which are only very
loosely tight?
> Container becomes
> the suitable management unit but not the whole host. And some performance
> sensitive containers desiderate THP to provide low latency communication.
> But if we use THP with 'always', it will consume more memory(on our machine
> that is about 10% of total memory). And unnecessary huge pages will increase
> memory pressure, add latency for minor pages faults, and add overhead when
> splitting huge pages or coalescing normal sized pages into huge pages.
It is still not really clear to me how do you achieve that the whole
workload in the said container has the same THP requirements.
--
Michal Hocko
SUSE Labs
On Mon, May 09, 2022 at 01:48:39PM +0200, Michal Hocko wrote:
> On Mon 09-05-22 11:26:43, CGEL wrote:
> > On Mon, May 09, 2022 at 12:00:28PM +0200, Michal Hocko wrote:
> > > On Sat 07-05-22 02:05:25, CGEL wrote:
> > > [...]
> > > > If there are many containers to run on one host, and some of them have high
> > > > performance requirements, administrator could turn on thp for them:
> > > > # docker run -it --thp-enabled=always
> > > > Then all the processes in those containers will always use thp.
> > > > While other containers turn off thp by:
> > > > # docker run -it --thp-enabled=never
> > >
> > > I do not know. The THP config space is already too confusing and complex
> > > and this just adds on top. E.g. is the behavior of the knob
> > > hierarchical? What is the policy if parent memcg says madivise while
> > > child says always? How does the per-application configuration aligns
> > > with all that (e.g. memcg policy madivise but application says never via
> > > prctl while still uses some madvised - e.g. via library).
> > >
> >
> > The cgroup THP behavior is align to host and totally independent just likes
> > /sys/fs/cgroup/memory.swappiness. That means if one cgroup config 'always'
> > for thp, it has no matter with host or other cgroup. This make it simple for
> > user to understand or control.
>
> All controls in cgroup v2 should be hierarchical. This is really
> required for a proper delegation semantic.
>
Could we align with the semantics of /sys/fs/cgroup/memory.swappiness?
Some distributions like Ubuntu are still using cgroup v1.
> > If memcg policy madivise but application says never, just like host, the result
> > is no THP for that application.
> >
> > > > By doing this we could promote important containers's performance with less
> > > > footprint of thp.
> > >
> > > Do we really want to provide something like THP based QoS? To me it
> > > sounds like a bad idea and if the justification is "it might be useful"
> > > then I would say no. So you really need to come with a very good usecase
> > > to promote this further.
> >
> > At least on some 5G(communication technology) machine, it's useful to provide
> > THP based QoS. Those 5G machine use micro-service software architecture, in
> > other words one service application runs in one container.
>
> I am not really sure I understand. If this is one application per
> container (cgroup) then why do you really need per-group setting?
> Does the application is a set of different processes which are only very
> loosely tight?
>
In a micro-service architecture, the application in one container is not a
set of loosely coupled processes; it aims to provide one specific service, so
different containers mean different services, and different services have
different QoS demands.
The reason why we need a per-group (per-container) setting is that most
containers are managed by compose software, and the compose software provides
the UI for deciding how to run a container (like setting the swappiness
value). For example, docker compose:
https://docs.docker.com/compose/#compose-v2-and-the-new-docker-compose-command
To make it clearer, here is a summary of why containers need this patch:
1.one machine can run different containers;
2.in some scenarios a container runs only one service inside (which can be
only one application);
3.different containers provide different services, and different services
have different QoS demands;
4.THP has a big influence on QoS: it is fast for memory access, but eats more
memory;
5.containers are usually managed by compose software, which treats the
container as the base management unit;
6.this patch provides a cgroup THP controller, which can be a method to
adjust container memory QoS.
> > Container becomes
> > the suitable management unit but not the whole host. And some performance
> > sensitive containers desiderate THP to provide low latency communication.
> > But if we use THP with 'always', it will consume more memory(on our machine
> > that is about 10% of total memory). And unnecessary huge pages will increase
> > memory pressure, add latency for minor pages faults, and add overhead when
> > splitting huge pages or coalescing normal sized pages into huge pages.
>
> It is still not really clear to me how do you achieve that the whole
> workload in the said container has the same THP requirements.
> --
> Michal Hocko
> SUSE Labs
On Tue 10-05-22 01:43:38, CGEL wrote:
> On Mon, May 09, 2022 at 01:48:39PM +0200, Michal Hocko wrote:
> > On Mon 09-05-22 11:26:43, CGEL wrote:
> > > On Mon, May 09, 2022 at 12:00:28PM +0200, Michal Hocko wrote:
> > > > On Sat 07-05-22 02:05:25, CGEL wrote:
> > > > [...]
> > > > > If there are many containers to run on one host, and some of them have high
> > > > > performance requirements, administrator could turn on thp for them:
> > > > > # docker run -it --thp-enabled=always
> > > > > Then all the processes in those containers will always use thp.
> > > > > While other containers turn off thp by:
> > > > > # docker run -it --thp-enabled=never
> > > >
> > > > I do not know. The THP config space is already too confusing and complex
> > > > and this just adds on top. E.g. is the behavior of the knob
> > > > hierarchical? What is the policy if parent memcg says madivise while
> > > > child says always? How does the per-application configuration aligns
> > > > with all that (e.g. memcg policy madivise but application says never via
> > > > prctl while still uses some madvised - e.g. via library).
> > > >
> > >
> > > The cgroup THP behavior is align to host and totally independent just likes
> > > /sys/fs/cgroup/memory.swappiness. That means if one cgroup config 'always'
> > > for thp, it has no matter with host or other cgroup. This make it simple for
> > > user to understand or control.
> >
> > All controls in cgroup v2 should be hierarchical. This is really
> > required for a proper delegation semantic.
> >
>
> Could we align to the semantic of /sys/fs/cgroup/memory.swappiness?
> Some distributions like Ubuntu is still using cgroup v1.
cgroup v1 interface is mostly frozen. All new features are added to the
v2 interface.
> > > If memcg policy madivise but application says never, just like host, the result
> > > is no THP for that application.
> > >
> > > > > By doing this we could promote important containers's performance with less
> > > > > footprint of thp.
> > > >
> > > > Do we really want to provide something like THP based QoS? To me it
> > > > sounds like a bad idea and if the justification is "it might be useful"
> > > > then I would say no. So you really need to come with a very good usecase
> > > > to promote this further.
> > >
> > > At least on some 5G(communication technology) machine, it's useful to provide
> > > THP based QoS. Those 5G machine use micro-service software architecture, in
> > > other words one service application runs in one container.
> >
> > I am not really sure I understand. If this is one application per
> > container (cgroup) then why do you really need per-group setting?
> > Does the application is a set of different processes which are only very
> > loosely tight?
> >
> For micro-service architecture, the application in one container is not a
> set of loosely tight processes, it's aim at provide one certain service,
> so different containers means different service, and different service
> has different QoS demand.
OK, if they are tightly coupled you could apply the same THP policy by
an existing prctl interface. Why is that not feasible. As you are noting
below...
> 5.containers usually managed by compose software, which treats container as
> base management unit;
..so the compose software can easily start up the workload by using prctl
to disable THP for whatever workloads it is not suitable for.
--
Michal Hocko
SUSE Labs
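As an illustration of that suggestion, a minimal launcher sketch using the
existing PR_SET_THP_DISABLE prctl; the wrapper itself is hypothetical, but
the flag it sets is inherited by children and preserved across execve(2):
#include <stdio.h>
#include <sys/prctl.h>
#include <unistd.h>

/* Run a workload with THP disabled for it and everything it spawns. */
int main(int argc, char **argv)
{
	if (argc < 2) {
		fprintf(stderr, "usage: %s <program> [args...]\n", argv[0]);
		return 1;
	}
	if (prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0))
		perror("prctl(PR_SET_THP_DISABLE)");
	execvp(argv[1], &argv[1]);
	perror("execvp");
	return 1;
}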
On Tue, May 10, 2022 at 12:00:04PM +0200, Michal Hocko wrote:
> On Tue 10-05-22 01:43:38, CGEL wrote:
> > On Mon, May 09, 2022 at 01:48:39PM +0200, Michal Hocko wrote:
> > > On Mon 09-05-22 11:26:43, CGEL wrote:
> > > > On Mon, May 09, 2022 at 12:00:28PM +0200, Michal Hocko wrote:
> > > > > On Sat 07-05-22 02:05:25, CGEL wrote:
> > > > > [...]
> > > > > > If there are many containers to run on one host, and some of them have high
> > > > > > performance requirements, administrator could turn on thp for them:
> > > > > > # docker run -it --thp-enabled=always
> > > > > > Then all the processes in those containers will always use thp.
> > > > > > While other containers turn off thp by:
> > > > > > # docker run -it --thp-enabled=never
> > > > >
> > > > > I do not know. The THP config space is already too confusing and complex
> > > > > and this just adds on top. E.g. is the behavior of the knob
> > > > > hierarchical? What is the policy if parent memcg says madivise while
> > > > > child says always? How does the per-application configuration aligns
> > > > > with all that (e.g. memcg policy madivise but application says never via
> > > > > prctl while still uses some madvised - e.g. via library).
> > > > >
> > > >
> > > > The cgroup THP behavior is align to host and totally independent just likes
> > > > /sys/fs/cgroup/memory.swappiness. That means if one cgroup config 'always'
> > > > for thp, it has no matter with host or other cgroup. This make it simple for
> > > > user to understand or control.
> > >
> > > All controls in cgroup v2 should be hierarchical. This is really
> > > required for a proper delegation semantic.
> > >
> >
> > Could we align to the semantic of /sys/fs/cgroup/memory.swappiness?
> > Some distributions like Ubuntu is still using cgroup v1.
>
> cgroup v1 interface is mostly frozen. All new features are added to the
> v2 interface.
>
So what about we add this interface to cgroup v2?
> > > > If memcg policy madivise but application says never, just like host, the result
> > > > is no THP for that application.
> > > >
> > > > > > By doing this we could promote important containers's performance with less
> > > > > > footprint of thp.
> > > > >
> > > > > Do we really want to provide something like THP based QoS? To me it
> > > > > sounds like a bad idea and if the justification is "it might be useful"
> > > > > then I would say no. So you really need to come with a very good usecase
> > > > > to promote this further.
> > > >
> > > > At least on some 5G(communication technology) machine, it's useful to provide
> > > > THP based QoS. Those 5G machine use micro-service software architecture, in
> > > > other words one service application runs in one container.
> > >
> > > I am not really sure I understand. If this is one application per
> > > container (cgroup) then why do you really need per-group setting?
> > > Does the application is a set of different processes which are only very
> > > loosely tight?
> > >
> > For micro-service architecture, the application in one container is not a
> > set of loosely tight processes, it's aim at provide one certain service,
> > so different containers means different service, and different service
> > has different QoS demand.
>
> OK, if they are tightly coupled you could apply the same THP policy by
> an existing prctl interface. Why is that not feasible. As you are noting
> below...
>
> > 5.containers usually managed by compose software, which treats container as
> > base management unit;
>
> ..so the compose software can easily start up the workload by using prctl
> to disable THP for whatever workloads it is not suitable for.
prctl(PR_SET_THP_DISABLE..) cannot elegantly support the semantic we need.
If only some containers need THP, while other containers and the host do not,
we must first set the host THP policy to always and then call prctl() to turn
THP off for host tasks and the other containers one by one; in this process,
some tasks that started before we call prctl() may already be using THP with
no need for it.
Also, compose's semantics treat the container, not individual tasks, as the
base unit of management. See:
https://docs.docker.com/compose/
If we treat the container as a lightweight virtual machine things become
clearer: this virtual machine has its own THP policy, set just like
/sys/kernel/mm/transparent_hugepage/enabled on the host, and it has nothing
to do with the host or other virtual machines.
> --
> Michal Hocko
> SUSE Labs
On Tue 10-05-22 11:52:51, CGEL wrote:
> On Tue, May 10, 2022 at 12:00:04PM +0200, Michal Hocko wrote:
> > On Tue 10-05-22 01:43:38, CGEL wrote:
> > > On Mon, May 09, 2022 at 01:48:39PM +0200, Michal Hocko wrote:
> > > > On Mon 09-05-22 11:26:43, CGEL wrote:
> > > > > On Mon, May 09, 2022 at 12:00:28PM +0200, Michal Hocko wrote:
> > > > > > On Sat 07-05-22 02:05:25, CGEL wrote:
> > > > > > [...]
> > > > > > > If there are many containers to run on one host, and some of them have high
> > > > > > > performance requirements, administrator could turn on thp for them:
> > > > > > > # docker run -it --thp-enabled=always
> > > > > > > Then all the processes in those containers will always use thp.
> > > > > > > While other containers turn off thp by:
> > > > > > > # docker run -it --thp-enabled=never
> > > > > >
> > > > > > I do not know. The THP config space is already too confusing and complex
> > > > > > and this just adds on top. E.g. is the behavior of the knob
> > > > > > hierarchical? What is the policy if parent memcg says madivise while
> > > > > > child says always? How does the per-application configuration aligns
> > > > > > with all that (e.g. memcg policy madivise but application says never via
> > > > > > prctl while still uses some madvised - e.g. via library).
> > > > > >
> > > > >
> > > > > The cgroup THP behavior is align to host and totally independent just likes
> > > > > /sys/fs/cgroup/memory.swappiness. That means if one cgroup config 'always'
> > > > > for thp, it has no matter with host or other cgroup. This make it simple for
> > > > > user to understand or control.
> > > >
> > > > All controls in cgroup v2 should be hierarchical. This is really
> > > > required for a proper delegation semantic.
> > > >
> > >
> > > Could we align to the semantic of /sys/fs/cgroup/memory.swappiness?
> > > Some distributions like Ubuntu is still using cgroup v1.
> >
> > cgroup v1 interface is mostly frozen. All new features are added to the
> > v2 interface.
> >
>
> So what about we add this interface to cgroup v2?
Can you come up with a sane hierarchical behavior?
[...]
> > > For micro-service architecture, the application in one container is not a
> > > set of loosely tight processes, it's aim at provide one certain service,
> > > so different containers means different service, and different service
> > > has different QoS demand.
> >
> > OK, if they are tightly coupled you could apply the same THP policy by
> > an existing prctl interface. Why is that not feasible. As you are noting
> > below...
> >
> > > 5.containers usually managed by compose software, which treats container as
> > > base management unit;
> >
> > ..so the compose software can easily start up the workload by using prctl
> > to disable THP for whatever workloads it is not suitable for.
>
> prctl(PR_SET_THP_DISABLE..) can not be elegance to support the semantic we
> need. If only some containers needs THP, other containers and host do not need
> THP. We must set host THP to always first, and call prctl() to close THP for
> host tasks and other containers one by one,
It might not be the most elegant solution but it should work.
Maintaining user interfaces for ever has some cost and the THP
configuration space is quite large already. So I would rather not add
more complication in unless that is absolutely necessary.
> in this process some tasks that start before we call prctl() may
> already use THP with no need.
As long as all those processes have a common ancestor I do not see how
that would be possible.
--
Michal Hocko
SUSE Labs
On Tue, May 10, 2022 at 07:47:29PM -0700, Shakeel Butt wrote:
> On Tue, May 10, 2022 at 7:19 PM CGEL <[email protected]> wrote:
> >
> [...]
> > > > >
> > > > > All controls in cgroup v2 should be hierarchical. This is really
> > > > > required for a proper delegation semantic.
> > > > >
> > > >
> > > > Could we align to the semantic of /sys/fs/cgroup/memory.swappiness?
> > > > Some distributions like Ubuntu is still using cgroup v1.
> > >
> > > Other than enable flag, how would you handle the defrag flag
> > > hierarchically? It is much more complicated.
> >
> > Refer to memory.swappiness for cgroup, this new interface better be independent.
>
> Let me give my 0.02. I buy the use-case of Admin restricting THPs to
> low priority jobs but I don't think memory controller is the right
> place to enforce that policy. Michal gave one way (prctl()) to enforce
> that policy. Have you explored the BPF way to enforce this policy?
Thanks!
prctl() (at least in its latest version) only supports disabling THP; its
semantics are not ideal. Maybe we could extend prctl() for THP?
BPF may be a way to realize more fine-grained THP control. But I think the
semantics come first.
So what about realizing three layers of THP controller? That may satisfy all
kinds of users:
Layer 1: whole system, already realized. See /sys/kernel/mm/transparent_hugepage/enabled.
Layer 2: container/cgroup, not yet realized. Useful for users who treat a
container as a lightweight virtual machine; let this override layer 1.
Layer 3: process, partially realized. See prctl(); let this override layers 1 & 2.
On Tue, May 10, 2022 at 07:47:29PM -0700, Shakeel Butt wrote:
> On Tue, May 10, 2022 at 7:19 PM CGEL <[email protected]> wrote:
> >
> [...]
> > > > >
> > > > > All controls in cgroup v2 should be hierarchical. This is really
> > > > > required for a proper delegation semantic.
> > > > >
> > > >
> > > > Could we align to the semantic of /sys/fs/cgroup/memory.swappiness?
> > > > Some distributions like Ubuntu is still using cgroup v1.
> > >
> > > Other than enable flag, how would you handle the defrag flag
> > > hierarchically? It is much more complicated.
> >
> > Refer to memory.swappiness for cgroup, this new interface better be independent.
>
> Let me give my 0.02. I buy the use-case of Admin restricting THPs to
> low priority jobs but I don't think memory controller is the right
> place to enforce that policy. Michal gave one way (prctl()) to enforce
> that policy. Have you explored the BPF way to enforce this policy?
+1 for bpf
I think these THP hints are too implementation-dependent and unstable to become
a part of cgroup API.
Thanks!
On Tue, May 10, 2022 at 7:19 PM CGEL <[email protected]> wrote:
>
[...]
> > > >
> > > > All controls in cgroup v2 should be hierarchical. This is really
> > > > required for a proper delegation semantic.
> > > >
> > >
> > > Could we align to the semantic of /sys/fs/cgroup/memory.swappiness?
> > > Some distributions like Ubuntu is still using cgroup v1.
> >
> > Other than enable flag, how would you handle the defrag flag
> > hierarchically? It is much more complicated.
>
> Refer to memory.swappiness for cgroup, this new interface better be independent.
Let me give my 0.02. I buy the use-case of Admin restricting THPs to
low priority jobs but I don't think memory controller is the right
place to enforce that policy. Michal gave one way (prctl()) to enforce
that policy. Have you explored the BPF way to enforce this policy?
On Tue, May 10, 2022 at 08:11:16PM -0700, Roman Gushchin wrote:
> On Tue, May 10, 2022 at 07:47:29PM -0700, Shakeel Butt wrote:
> > On Tue, May 10, 2022 at 7:19 PM CGEL <[email protected]> wrote:
> > >
> > [...]
> > > > > >
> > > > > > All controls in cgroup v2 should be hierarchical. This is really
> > > > > > required for a proper delegation semantic.
> > > > > >
> > > > >
> > > > > Could we align to the semantic of /sys/fs/cgroup/memory.swappiness?
> > > > > Some distributions like Ubuntu is still using cgroup v1.
> > > >
> > > > Other than enable flag, how would you handle the defrag flag
> > > > hierarchically? It is much more complicated.
> > >
> > > Refer to memory.swappiness for cgroup, this new interface better be independent.
> >
> > Let me give my 0.02. I buy the use-case of Admin restricting THPs to
> > low priority jobs but I don't think memory controller is the right
> > place to enforce that policy. Michal gave one way (prctl()) to enforce
> > that policy. Have you explored the BPF way to enforce this policy?
>
> +1 for bpf
>
> I think these THP hints are too implementation-dependent and unstable to become
> a part of cgroup API.
>
Thanks! If there are no other suggestions we will submit a BPF version of this patch.
> Thanks!
On Mon, May 9, 2022 at 6:43 PM CGEL <[email protected]> wrote:
>
> On Mon, May 09, 2022 at 01:48:39PM +0200, Michal Hocko wrote:
> > On Mon 09-05-22 11:26:43, CGEL wrote:
> > > On Mon, May 09, 2022 at 12:00:28PM +0200, Michal Hocko wrote:
> > > > On Sat 07-05-22 02:05:25, CGEL wrote:
> > > > [...]
> > > > > If there are many containers to run on one host, and some of them have high
> > > > > performance requirements, administrator could turn on thp for them:
> > > > > # docker run -it --thp-enabled=always
> > > > > Then all the processes in those containers will always use thp.
> > > > > While other containers turn off thp by:
> > > > > # docker run -it --thp-enabled=never
> > > >
> > > > I do not know. The THP config space is already too confusing and complex
> > > > and this just adds on top. E.g. is the behavior of the knob
> > > > hierarchical? What is the policy if parent memcg says madivise while
> > > > child says always? How does the per-application configuration aligns
> > > > with all that (e.g. memcg policy madivise but application says never via
> > > > prctl while still uses some madvised - e.g. via library).
> > > >
> > >
> > > The cgroup THP behavior is align to host and totally independent just likes
> > > /sys/fs/cgroup/memory.swappiness. That means if one cgroup config 'always'
> > > for thp, it has no matter with host or other cgroup. This make it simple for
> > > user to understand or control.
> >
> > All controls in cgroup v2 should be hierarchical. This is really
> > required for a proper delegation semantic.
> >
>
> Could we align to the semantic of /sys/fs/cgroup/memory.swappiness?
> Some distributions like Ubuntu is still using cgroup v1.
Other than enable flag, how would you handle the defrag flag
hierarchically? It is much more complicated.
>
> > > If memcg policy madivise but application says never, just like host, the result
> > > is no THP for that application.
> > >
> > > > > By doing this we could promote important containers's performance with less
> > > > > footprint of thp.
> > > >
> > > > Do we really want to provide something like THP based QoS? To me it
> > > > sounds like a bad idea and if the justification is "it might be useful"
> > > > then I would say no. So you really need to come with a very good usecase
> > > > to promote this further.
> > >
> > > At least on some 5G(communication technology) machine, it's useful to provide
> > > THP based QoS. Those 5G machine use micro-service software architecture, in
> > > other words one service application runs in one container.
> >
> > I am not really sure I understand. If this is one application per
> > container (cgroup) then why do you really need per-group setting?
> > Does the application is a set of different processes which are only very
> > loosely tight?
> >
> For micro-service architecture, the application in one container is not a
> set of loosely tight processes, it's aim at provide one certain service,
> so different containers means different service, and different service
> has different QoS demand.
>
> The reason why we need per-group(per-container) setting is because most
> container are managed by compose software, the compose software provide
> UI to decide how to run a container(likes setting swappiness value). For
> example the docker compose:
> https://docs.docker.com/compose/#compose-v2-and-the-new-docker-compose-command
>
> To make it clearer, I try to make a summary for why container needs this patch:
> 1.one machine can run different containers;
> 2.for some scenario, container runs only one service inside(can be only one
> application);
> 3.different containers provide different services, different services have
> different QoS demands;
> 4.THP has big influence on QoS. It's fast for memory access, but eat more
> memory;
I have been involved in this kind of topic discussion offline a couple
of times. But TBH I don't see how you could achieve QoS by this flag.
THP allocation is *NOT* guaranteed. And the overhead may be quite
high. It depends on how fragmented the system is.
> 5.containers usually managed by compose software, which treats container as
> base management unit;
> 6.this patch provide cgroup THP controller, which can be a method to adjust
> container memory QoS.
>
> > > Container becomes
> > > the suitable management unit but not the whole host. And some performance
> > > sensitive containers desiderate THP to provide low latency communication.
> > > But if we use THP with 'always', it will consume more memory(on our machine
> > > that is about 10% of total memory). And unnecessary huge pages will increase
> > > memory pressure, add latency for minor pages faults, and add overhead when
> > > splitting huge pages or coalescing normal sized pages into huge pages.
> >
> > It is still not really clear to me how do you achieve that the whole
> > workload in the said container has the same THP requirements.
> > --
> > Michal Hocko
> > SUSE Labs
On Wed 11-05-22 01:59:52, CGEL wrote:
> On Tue, May 10, 2022 at 03:36:34PM +0200, Michal Hocko wrote:
[...]
> > Can you come up with a sane hierarchical behavior?
> >
>
> I think this new interface better be independent not hierarchical anyway. Especially
> when we treat container as lightweight virtual machine.
I suspect you are focusing too much on your usecase and do not realize
wider consequences of this being an user interface that still has to be
sensible for other usecases. Take a delagation of the control to
subgroups as an example. If this is a per memcg knob (like swappiness)
then children can override parent's THP policy. This might be a less of
the deal for swappiness because the anon/file reclaim balancing should
be mostly an internal thing. But THP policy is different because it has
other effects to workloads running outside of the said cgroup - higher
memory demand, higher contention for high-order memory etc.
I do not really see how this could be a sensible per-memcg policy
without being fully hierarchical.
>
> > [...]
> > > > > For micro-service architecture, the application in one container is not a
> > > > > set of loosely tight processes, it's aim at provide one certain service,
> > > > > so different containers means different service, and different service
> > > > > has different QoS demand.
> > > >
> > > > OK, if they are tightly coupled you could apply the same THP policy by
> > > > an existing prctl interface. Why is that not feasible. As you are noting
> > > > below...
> > > >
> > > > > 5.containers usually managed by compose software, which treats container as
> > > > > base management unit;
> > > >
> > > > ..so the compose software can easily start up the workload by using prctl
> > > > to disable THP for whatever workloads it is not suitable for.
> > >
> > > prctl(PR_SET_THP_DISABLE..) can not be elegance to support the semantic we
> > > need. If only some containers needs THP, other containers and host do not need
> > > THP. We must set host THP to always first, and call prctl() to close THP for
> > > host tasks and other containers one by one,
> >
> > It might not be the most elegant solution but it should work.
>
> So you agree it's reasonable to set THP policy for process in container, right?
Yes, just like for any other process.
> If so, IMHO, when there are thousands of processes launch and die on the machine,
> it will be horrible to do so by calling prctl(), I don't see the reasonability.
Could you be more specific? The usual prctl use would normally be
handled by the launcher, relying on the per-process policy being
inherited down the road.
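A minimal sketch of that launcher pattern, assuming a plain standalone
wrapper rather than any particular container runtime, could look like
the following. PR_SET_THP_DISABLE is inherited across fork() and
preserved across execve(), so everything spawned below keeps the
setting:

#include <stdio.h>
#include <sys/prctl.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	if (argc < 2) {
		fprintf(stderr, "usage: %s <program> [args...]\n", argv[0]);
		return 1;
	}
	/* Opt this process out of THP; all children inherit the flag. */
	if (prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0)) {
		perror("prctl(PR_SET_THP_DISABLE)");
		return 1;
	}
	execvp(argv[1], &argv[1]);	/* the workload runs with THP disabled */
	perror("execvp");
	return 1;
}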
--
Michal Hocko
SUSE Labs
On Tue, May 10, 2022 at 03:36:34PM +0200, Michal Hocko wrote:
> On Tue 10-05-22 11:52:51, CGEL wrote:
> > On Tue, May 10, 2022 at 12:00:04PM +0200, Michal Hocko wrote:
> > > On Tue 10-05-22 01:43:38, CGEL wrote:
> > > > On Mon, May 09, 2022 at 01:48:39PM +0200, Michal Hocko wrote:
> > > > > On Mon 09-05-22 11:26:43, CGEL wrote:
> > > > > > On Mon, May 09, 2022 at 12:00:28PM +0200, Michal Hocko wrote:
> > > > > > > On Sat 07-05-22 02:05:25, CGEL wrote:
> > > > > > > [...]
> > > > > > > > If there are many containers to run on one host, and some of them have high
> > > > > > > > performance requirements, administrator could turn on thp for them:
> > > > > > > > # docker run -it --thp-enabled=always
> > > > > > > > Then all the processes in those containers will always use thp.
> > > > > > > > While other containers turn off thp by:
> > > > > > > > # docker run -it --thp-enabled=never
> > > > > > >
> > > > > > > I do not know. The THP config space is already too confusing and complex
> > > > > > > and this just adds on top. E.g. is the behavior of the knob
> > > > > > > hierarchical? What is the policy if parent memcg says madivise while
> > > > > > > child says always? How does the per-application configuration aligns
> > > > > > > with all that (e.g. memcg policy madivise but application says never via
> > > > > > > prctl while still uses some madvised - e.g. via library).
> > > > > > >
> > > > > >
> > > > > > The cgroup THP behavior is align to host and totally independent just likes
> > > > > > /sys/fs/cgroup/memory.swappiness. That means if one cgroup config 'always'
> > > > > > for thp, it has no matter with host or other cgroup. This make it simple for
> > > > > > user to understand or control.
> > > > >
> > > > > All controls in cgroup v2 should be hierarchical. This is really
> > > > > required for a proper delegation semantic.
> > > > >
> > > >
> > > > Could we align to the semantic of /sys/fs/cgroup/memory.swappiness?
> > > > Some distributions like Ubuntu is still using cgroup v1.
> > >
> > > cgroup v1 interface is mostly frozen. All new features are added to the
> > > v2 interface.
> > >
> >
> > So what about we add this interface to cgroup v2?
>
> Can you come up with a sane hierarchical behavior?
>
I think this new interface had better be independent rather than hierarchical anyway,
especially when we treat a container as a lightweight virtual machine.
> [...]
> > > > For micro-service architecture, the application in one container is not a
> > > > set of loosely tight processes, it's aim at provide one certain service,
> > > > so different containers means different service, and different service
> > > > has different QoS demand.
> > >
> > > OK, if they are tightly coupled you could apply the same THP policy by
> > > an existing prctl interface. Why is that not feasible. As you are noting
> > > below...
> > >
> > > > 5.containers usually managed by compose software, which treats container as
> > > > base management unit;
> > >
> > > ..so the compose software can easily start up the workload by using prctl
> > > to disable THP for whatever workloads it is not suitable for.
> >
> > prctl(PR_SET_THP_DISABLE..) can not be elegance to support the semantic we
> > need. If only some containers needs THP, other containers and host do not need
> > THP. We must set host THP to always first, and call prctl() to close THP for
> > host tasks and other containers one by one,
>
> It might not be the most elegant solution but it should work.
So you agree it's reasonable to set a THP policy for processes in a container, right?
If so, IMHO, when there are thousands of processes launching and dying on the machine,
it would be horrible to do that by calling prctl() for each of them; I don't see how that is reasonable.
> Maintaining user interfaces for ever has some cost and the THP
> configuration space is quite large already. So I would rather not add
> more complication in unless that is absolutely necessary.
>
> > in this process some tasks that start before we call prctl() may
> > already use THP with no need.
>
> As long as all those processes have a common ancestor I do not see how
> that would be possible.
>
For example:
1) userspace sets the THP policy to 'always';
2) an unrelated process A is then launched automatically, perhaps by a script;
3) prctl() is called to disable THP for A.
By that point process A may already be using THP with no need for it.
> --
> Michal Hocko
> SUSE Labs
On Wed, May 11, 2022 at 09:21:53AM +0200, Michal Hocko wrote:
> On Wed 11-05-22 01:59:52, CGEL wrote:
> > On Tue, May 10, 2022 at 03:36:34PM +0200, Michal Hocko wrote:
> [...]
> > > Can you come up with a sane hierarchical behavior?
> > >
> >
> > I think this new interface better be independent not hierarchical anyway. Especially
> > when we treat container as lightweight virtual machine.
>
> I suspect you are focusing too much on your usecase and do not realize
> wider consequences of this being an user interface that still has to be
> sensible for other usecases. Take a delagation of the control to
> subgroups as an example. If this is a per memcg knob (like swappiness)
> then children can override parent's THP policy. This might be a less of
> the deal for swappiness because the anon/file reclaim balancing should
> be mostly an internal thing. But THP policy is different because it has
> other effects to workloads running outside of the said cgroup - higher
> memory demand, higher contention for high-order memory etc.
>
Higher memory demand will still be bounded by memsw.limit_in_bytes, right?
And does cgroup really care about high-order memory usage? At least for
now there is no cgroup limit for that.
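As a rough illustration of that bound under cgroup v1 (the cgroup path
and the 2G value are made up for the example, and
memory.memsw.limit_in_bytes is only available with swap accounting
enabled):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_limit(const char *path, unsigned long long bytes)
{
	char buf[32];
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	snprintf(buf, sizeof(buf), "%llu", bytes);
	if (write(fd, buf, strlen(buf)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	/* hypothetical container cgroup */
	const char *grp = "/sys/fs/cgroup/memory/important_container";
	char path[256];

	/* memory.limit_in_bytes must be set before the memsw limit */
	snprintf(path, sizeof(path), "%s/memory.limit_in_bytes", grp);
	if (write_limit(path, 2ULL << 30))
		perror(path);
	snprintf(path, sizeof(path), "%s/memory.memsw.limit_in_bytes", grp);
	if (write_limit(path, 2ULL << 30))
		perror(path);
	return 0;
}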
> I do not really see how this could be a sensible per-memcg policy
> without being fully hierarchical.
>
Thanks for the patient discussion. As Roman suggested, I will try to implement this
with BPF.
> >
> > > [...]
> > > > > > For micro-service architecture, the application in one container is not a
> > > > > > set of loosely tight processes, it's aim at provide one certain service,
> > > > > > so different containers means different service, and different service
> > > > > > has different QoS demand.
> > > > >
> > > > > OK, if they are tightly coupled you could apply the same THP policy by
> > > > > an existing prctl interface. Why is that not feasible. As you are noting
> > > > > below...
> > > > >
> > > > > > 5.containers usually managed by compose software, which treats container as
> > > > > > base management unit;
> > > > >
> > > > > ..so the compose software can easily start up the workload by using prctl
> > > > > to disable THP for whatever workloads it is not suitable for.
> > > >
> > > > prctl(PR_SET_THP_DISABLE..) can not be elegance to support the semantic we
> > > > need. If only some containers needs THP, other containers and host do not need
> > > > THP. We must set host THP to always first, and call prctl() to close THP for
> > > > host tasks and other containers one by one,
> > >
> > > It might not be the most elegant solution but it should work.
> >
> > So you agree it's reasonable to set THP policy for process in container, right?
>
> Yes, like in any other processes.
>
> > If so, IMHO, when there are thousands of processes launch and die on the machine,
> > it will be horrible to do so by calling prctl(), I don't see the reasonability.
>
> Could you be more specific? The usual prctl use would be normally
> handled by the launcher and rely on the per-process policy to be
> inherited down the road.
>
> --
> Michal Hocko
> SUSE Labs
On Tue, May 10, 2022 at 12:34:20PM -0700, Yang Shi wrote:
> On Mon, May 9, 2022 at 6:43 PM CGEL <[email protected]> wrote:
> >
> > On Mon, May 09, 2022 at 01:48:39PM +0200, Michal Hocko wrote:
> > > On Mon 09-05-22 11:26:43, CGEL wrote:
> > > > On Mon, May 09, 2022 at 12:00:28PM +0200, Michal Hocko wrote:
> > > > > On Sat 07-05-22 02:05:25, CGEL wrote:
> > > > > [...]
> > > > > > If there are many containers to run on one host, and some of them have high
> > > > > > performance requirements, administrator could turn on thp for them:
> > > > > > # docker run -it --thp-enabled=always
> > > > > > Then all the processes in those containers will always use thp.
> > > > > > While other containers turn off thp by:
> > > > > > # docker run -it --thp-enabled=never
> > > > >
> > > > > I do not know. The THP config space is already too confusing and complex
> > > > > and this just adds on top. E.g. is the behavior of the knob
> > > > > hierarchical? What is the policy if parent memcg says madivise while
> > > > > child says always? How does the per-application configuration aligns
> > > > > with all that (e.g. memcg policy madivise but application says never via
> > > > > prctl while still uses some madvised - e.g. via library).
> > > > >
> > > >
> > > > The cgroup THP behavior is align to host and totally independent just likes
> > > > /sys/fs/cgroup/memory.swappiness. That means if one cgroup config 'always'
> > > > for thp, it has no matter with host or other cgroup. This make it simple for
> > > > user to understand or control.
> > >
> > > All controls in cgroup v2 should be hierarchical. This is really
> > > required for a proper delegation semantic.
> > >
> >
> > Could we align to the semantic of /sys/fs/cgroup/memory.swappiness?
> > Some distributions like Ubuntu is still using cgroup v1.
>
> Other than enable flag, how would you handle the defrag flag
> hierarchically? It is much more complicated.
Following the example of cgroup's memory.swappiness, this new interface had better be independent.
> >
> > > > If memcg policy madivise but application says never, just like host, the result
> > > > is no THP for that application.
> > > >
> > > > > > By doing this we could promote important containers's performance with less
> > > > > > footprint of thp.
> > > > >
> > > > > Do we really want to provide something like THP based QoS? To me it
> > > > > sounds like a bad idea and if the justification is "it might be useful"
> > > > > then I would say no. So you really need to come with a very good usecase
> > > > > to promote this further.
> > > >
> > > > At least on some 5G(communication technology) machine, it's useful to provide
> > > > THP based QoS. Those 5G machine use micro-service software architecture, in
> > > > other words one service application runs in one container.
> > >
> > > I am not really sure I understand. If this is one application per
> > > container (cgroup) then why do you really need per-group setting?
> > > Does the application is a set of different processes which are only very
> > > loosely tight?
> > >
> > For micro-service architecture, the application in one container is not a
> > set of loosely tight processes, it's aim at provide one certain service,
> > so different containers means different service, and different service
> > has different QoS demand.
> >
> > The reason why we need per-group(per-container) setting is because most
> > container are managed by compose software, the compose software provide
> > UI to decide how to run a container(likes setting swappiness value). For
> > example the docker compose:
> > https://docs.docker.com/compose/#compose-v2-and-the-new-docker-compose-command
> >
> > To make it clearer, I try to make a summary for why container needs this patch:
> > 1.one machine can run different containers;
> > 2.for some scenario, container runs only one service inside(can be only one
> > application);
> > 3.different containers provide different services, different services have
> > different QoS demands;
> > 4.THP has big influence on QoS. It's fast for memory access, but eat more
> > memory;
>
> I have been involved in this kind of topic discussion offline a couple
> of times. But TBH I don't see how you could achieve QoS by this flag.
> THP allocation is *NOT* guaranteed. And the overhead may be quite
> high. It depends on how fragmented the system is.
For THP, the word 'QoS' may be too strong, so let's describe it in terms of why users
want THP: seeking better memory performance.
Yes, as you said, THP may carry quite some overhead, and madvise() may not always be
suitable (see PR_SET_THP_DISABLE in https://man7.org/linux/man-pages/man2/prctl.2.html).
So I think that is exactly why we need this patch: it gives users a way to apply THP
with a more precise scope (only the performance-sensitive containers) and to reduce
the overhead elsewhere.
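For comparison, the existing per-range mechanism needs the application
itself to opt in with madvise(MADV_HUGEPAGE), which is exactly the
re-coding that is not always possible. A minimal sketch, with an
arbitrary 64MB buffer:

#include <stdio.h>
#include <sys/mman.h>

#define SZ (64UL << 20)		/* 64MB, a multiple of the 2MB huge page size */

int main(void)
{
	void *buf = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* hint that this range benefits from huge pages */
	if (madvise(buf, SZ, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");
	/* ... the hot data structures would live in buf ... */
	munmap(buf, SZ);
	return 0;
}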
>
> > 5.containers usually managed by compose software, which treats container as
> > base management unit;
> > 6.this patch provide cgroup THP controller, which can be a method to adjust
> > container memory QoS.
> >
> > > > Container becomes
> > > > the suitable management unit but not the whole host. And some performance
> > > > sensitive containers desiderate THP to provide low latency communication.
> > > > But if we use THP with 'always', it will consume more memory(on our machine
> > > > that is about 10% of total memory). And unnecessary huge pages will increase
> > > > memory pressure, add latency for minor pages faults, and add overhead when
> > > > splitting huge pages or coalescing normal sized pages into huge pages.
> > >
> > > It is still not really clear to me how do you achieve that the whole
> > > workload in the said container has the same THP requirements.
> > > --
> > > Michal Hocko
> > > SUSE Labs
On Tue, May 10, 2022 at 03:36:34PM +0200, Michal Hocko wrote:
> On Tue 10-05-22 11:52:51, CGEL wrote:
> > On Tue, May 10, 2022 at 12:00:04PM +0200, Michal Hocko wrote:
> > > On Tue 10-05-22 01:43:38, CGEL wrote:
> > > > On Mon, May 09, 2022 at 01:48:39PM +0200, Michal Hocko wrote:
> > > > > On Mon 09-05-22 11:26:43, CGEL wrote:
> > > > > > On Mon, May 09, 2022 at 12:00:28PM +0200, Michal Hocko wrote:
> > > > > > > On Sat 07-05-22 02:05:25, CGEL wrote:
> > > > > > > [...]
> > > > > > > > If there are many containers to run on one host, and some of them have high
> > > > > > > > performance requirements, administrator could turn on thp for them:
> > > > > > > > # docker run -it --thp-enabled=always
> > > > > > > > Then all the processes in those containers will always use thp.
> > > > > > > > While other containers turn off thp by:
> > > > > > > > # docker run -it --thp-enabled=never
> > > > > > >
> > > > > > > I do not know. The THP config space is already too confusing and complex
> > > > > > > and this just adds on top. E.g. is the behavior of the knob
> > > > > > > hierarchical? What is the policy if parent memcg says madivise while
> > > > > > > child says always? How does the per-application configuration aligns
> > > > > > > with all that (e.g. memcg policy madivise but application says never via
> > > > > > > prctl while still uses some madvised - e.g. via library).
> > > > > > >
> > > > > >
> > > > > > The cgroup THP behavior is align to host and totally independent just likes
> > > > > > /sys/fs/cgroup/memory.swappiness. That means if one cgroup config 'always'
> > > > > > for thp, it has no matter with host or other cgroup. This make it simple for
> > > > > > user to understand or control.
> > > > >
> > > > > All controls in cgroup v2 should be hierarchical. This is really
> > > > > required for a proper delegation semantic.
> > > > >
> > > >
> > > > Could we align to the semantic of /sys/fs/cgroup/memory.swappiness?
> > > > Some distributions like Ubuntu is still using cgroup v1.
> > >
> > > cgroup v1 interface is mostly frozen. All new features are added to the
> > > v2 interface.
> > >
> >
> > So what about we add this interface to cgroup v2?
>
> Can you come up with a sane hierarchical behavior?
>
> [...]
> > > > For micro-service architecture, the application in one container is not a
> > > > set of loosely tight processes, it's aim at provide one certain service,
> > > > so different containers means different service, and different service
> > > > has different QoS demand.
> > >
> > > OK, if they are tightly coupled you could apply the same THP policy by
> > > an existing prctl interface. Why is that not feasible. As you are noting
> > > below...
> > >
> > > > 5.containers usually managed by compose software, which treats container as
> > > > base management unit;
> > >
> > > ..so the compose software can easily start up the workload by using prctl
> > > to disable THP for whatever workloads it is not suitable for.
> >
> > prctl(PR_SET_THP_DISABLE..) can not be elegance to support the semantic we
> > need. If only some containers needs THP, other containers and host do not need
> > THP. We must set host THP to always first, and call prctl() to close THP for
> > host tasks and other containers one by one,
>
> It might not be the most elegant solution but it should work.
> Maintaining user interfaces for ever has some cost and the THP
> configuration space is quite large already. So I would rather not add
> more complication in unless that is absolutely necessary.
>
By the way, should we let prctl() support PR_SET_THP_ALWAYS, just like the
PR_TASK_PERF_EVENTS_DISABLE / PR_TASK_PERF_EVENTS_ENABLE pair? This would
make it simpler to let certain processes use THP while others do not.
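To be clear about what exists today: prctl() only offers a per-process
'disable' bit, and clearing it just falls back to the global policy.
PR_SET_THP_ALWAYS in the comment below is the hypothetical addition
being asked about, it does not exist in the kernel:

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	/* What exists today: a per-process "disable" bit only. */
	prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0);	/* opt this process out of THP */
	prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0);	/* back to the global sysfs policy */
	printf("THP disabled: %d\n", (int)prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0));

	/*
	 * The suggestion above would add a hypothetical PR_SET_THP_ALWAYS,
	 * i.e. a per-process way to opt *in* even when the global policy is
	 * 'madvise' or 'never'. No such prctl exists today.
	 */
	return 0;
}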
> > in this process some tasks that start before we call prctl() may
> > already use THP with no need.
>
> As long as all those processes have a common ancestor I do not see how
> that would be possible.
>
> --
> Michal Hocko
> SUSE Labs
On Wed, May 11, 2022 at 03:31:00AM +0000, CGEL wrote:
> On Tue, May 10, 2022 at 08:11:16PM -0700, Roman Gushchin wrote:
> > On Tue, May 10, 2022 at 07:47:29PM -0700, Shakeel Butt wrote:
> > > On Tue, May 10, 2022 at 7:19 PM CGEL <[email protected]> wrote:
> > > >
> > > [...]
> > > > > > >
> > > > > > > All controls in cgroup v2 should be hierarchical. This is really
> > > > > > > required for a proper delegation semantic.
> > > > > > >
> > > > > >
> > > > > > Could we align to the semantic of /sys/fs/cgroup/memory.swappiness?
> > > > > > Some distributions like Ubuntu is still using cgroup v1.
> > > > >
> > > > > Other than enable flag, how would you handle the defrag flag
> > > > > hierarchically? It is much more complicated.
> > > >
> > > > Refer to memory.swappiness for cgroup, this new interface better be independent.
> > >
> > > Let me give my 0.02. I buy the use-case of Admin restricting THPs to
> > > low priority jobs but I don't think memory controller is the right
> > > place to enforce that policy. Michal gave one way (prctl()) to enforce
> > > that policy. Have you explored the BPF way to enforce this policy?
> >
> > +1 for bpf
> >
> > I think these THP hints are too implementation-dependent and unstable to become
> > a part of cgroup API.
> >
>
> Thanks! If no other suggesting we will submit a bpf version of this patch.
>
What is your proposal for BPF? How do you intend to add attach points
(attach_type) for policy? Is it still going to be per cgroup?
Balbir Singh