2021-07-22 15:22:41

by brookxu.cn

[permalink] [raw]
Subject: [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit

From: Chunguang Xu <[email protected]>

Since the global open files are limited, in order to avoid the
abnormal behavior of some containers from generating too many
files, causing other containers to be unavailable, we need to
limit the open files of some containers.

v2: fix compile error while CONFIG_CGROUP_MISC not set.

Signed-off-by: Chunguang Xu <[email protected]>
Reported-by: kernel test robot <[email protected]>
---
fs/file_table.c | 28 ++++++++++++++++++++++++++--
include/linux/fs.h | 4 +++-
include/linux/misc_cgroup.h | 1 +
kernel/cgroup/misc.c | 1 +
4 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/fs/file_table.c b/fs/file_table.c
index 45437f8e1003..5957b2de9701 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -29,6 +29,7 @@
#include <linux/swap.h>

#include <linux/atomic.h>
+#include <linux/misc_cgroup.h>

#include "internal.h"

@@ -53,8 +54,16 @@ static void file_free_rcu(struct rcu_head *head)
static inline void file_free(struct file *f)
{
security_file_free(f);
- if (!(f->f_mode & FMODE_NOACCOUNT))
+ if (!(f->f_mode & FMODE_NOACCOUNT)) {
+#ifdef CONFIG_CGROUP_MISC
+ struct misc_cg *misc_cg = css_misc(f->f_css);
+
+ misc_cg_uncharge(MISC_CG_RES_NOFILE, misc_cg, 1);
+ put_misc_cg(misc_cg);
+#endif
+
percpu_counter_dec(&nr_files);
+ }
call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
}

@@ -148,8 +157,22 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
}

f = __alloc_file(flags, cred);
- if (!IS_ERR(f))
+ if (!IS_ERR(f)) {
+#ifdef CONFIG_CGROUP_MISC
+ struct misc_cg *misc_cg = get_current_misc_cg();
+ int ret;
+
+ ret = misc_cg_try_charge(MISC_CG_RES_NOFILE, misc_cg, 1);
+ if (ret < 0) {
+ file_free(f);
+ put_misc_cg(misc_cg);
+ return ERR_PTR(-ENFILE);
+ }
+ f->f_css = &misc_cg->css;
+#endif
+
percpu_counter_inc(&nr_files);
+ }

return f;

@@ -397,4 +420,5 @@ void __init files_maxfiles_init(void)
n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;

files_stat.max_files = max_t(unsigned long, n, NR_FILE);
+ misc_cg_set_capacity(MISC_CG_RES_NOFILE, files_stat.max_files);
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fad6663cd1b0..9ef3dd579ed6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -947,7 +947,9 @@ struct file {
#endif
/* needed for tty driver, and maybe others */
void *private_data;
-
+#ifdef CONFIG_CGROUP_MISC
+ struct cgroup_subsys_state *f_css;
+#endif
#ifdef CONFIG_EPOLL
/* Used by fs/eventpoll.c to link all the hooks to this file */
struct hlist_head *f_ep;
diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h
index da2367e2ac1e..8450a5e66de0 100644
--- a/include/linux/misc_cgroup.h
+++ b/include/linux/misc_cgroup.h
@@ -18,6 +18,7 @@ enum misc_res_type {
/* AMD SEV-ES ASIDs resource */
MISC_CG_RES_SEV_ES,
#endif
+ MISC_CG_RES_NOFILE,
MISC_CG_RES_TYPES
};

diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index ec02d963cad1..5d51b8eeece6 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -24,6 +24,7 @@ static const char *const misc_res_name[] = {
/* AMD SEV-ES ASIDs resource */
"sev_es",
#endif
+ "nofile"
};

/* Root misc cgroup */
--
2.30.0


2021-07-22 15:24:03

by brookxu.cn

[permalink] [raw]
Subject: [RFC PATCH v2 2/3] misc_cgroup: add failcnt counter

From: Chunguang Xu <[email protected]>

Instead of printing logs, we should probably track failures through
a failcnt counter, similar to mem_cgroup.

Signed-off-by: Chunguang Xu <[email protected]>
---
include/linux/misc_cgroup.h | 1 +
kernel/cgroup/misc.c | 30 ++++++++++++++++++++++++++++++
2 files changed, 31 insertions(+)

diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h
index 8450a5e66de0..dd1a786f39b8 100644
--- a/include/linux/misc_cgroup.h
+++ b/include/linux/misc_cgroup.h
@@ -37,6 +37,7 @@ struct misc_cg;
struct misc_res {
unsigned long max;
atomic_long_t usage;
+ atomic_long_t failcnt;
bool failed;
};

diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index 5d51b8eeece6..7c568b619f82 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -165,6 +165,7 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
pr_cont("\n");
res->failed = true;
}
+ atomic_long_inc(&res->failcnt);
ret = -EBUSY;
goto err_charge;
}
@@ -312,6 +313,29 @@ static int misc_cg_current_show(struct seq_file *sf, void *v)
return 0;
}

+/**
+ * misc_cg_failcnt_show() - Show the fail count of the misc cgroup.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_failcnt_show(struct seq_file *sf, void *v)
+{
+ int i;
+ unsigned long failcnt;
+ struct misc_cg *cg = css_misc(seq_css(sf));
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ failcnt = atomic_long_read(&cg->res[i].failcnt);
+ if (READ_ONCE(misc_res_capacity[i]) || failcnt)
+ seq_printf(sf, "%s %lu\n", misc_res_name[i], failcnt);
+ }
+
+ return 0;
+}
+
/**
* misc_cg_capacity_show() - Show the total capacity of misc res on the host.
* @sf: Interface file
@@ -349,6 +373,11 @@ static struct cftype misc_cg_files[] = {
.seq_show = misc_cg_current_show,
.flags = CFTYPE_NOT_ON_ROOT,
},
+ {
+ .name = "failcnt",
+ .seq_show = misc_cg_failcnt_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
{
.name = "capacity",
.seq_show = misc_cg_capacity_show,
@@ -383,6 +412,7 @@ misc_cg_alloc(struct cgroup_subsys_state *parent_css)
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
WRITE_ONCE(cg->res[i].max, MAX_NUM);
atomic_long_set(&cg->res[i].usage, 0);
+ atomic_long_set(&cg->res[i].failcnt, 0);
}

return &cg->css;
--
2.30.0

2021-07-22 15:24:16

by brookxu.cn

[permalink] [raw]
Subject: [RFC PATCH v2 3/3] misc_cgroup: delete failed logs to avoid log flooding

From: Chunguang Xu <[email protected]>

Since the upper-level logic will constantly retry when it fails, in
high-stress scenarios, a large number of failure logs may affect
performance. Therefore, we can replace it with the failcnt counter.

Signed-off-by: Chunguang Xu <[email protected]>
---
kernel/cgroup/misc.c | 2 --
1 file changed, 2 deletions(-)

diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index 7c568b619f82..b7de0fafa48a 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -159,8 +159,6 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
if (new_usage > READ_ONCE(res->max) ||
new_usage > READ_ONCE(misc_res_capacity[type])) {
if (!res->failed) {
- pr_info("cgroup: charge rejected by the misc controller for %s resource in ",
- misc_res_name[type]);
pr_cont_cgroup_path(i->css.cgroup);
pr_cont("\n");
res->failed = true;
--
2.30.0

2021-07-26 21:28:49

by Tejun Heo

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit

On Thu, Jul 22, 2021 at 11:20:17PM +0800, brookxu wrote:
> From: Chunguang Xu <[email protected]>
>
> Since the global open files are limited, in order to avoid the
> abnormal behavior of some containers from generating too many
> files, causing other containers to be unavailable, we need to
> limit the open files of some containers.
>
> v2: fix compile error while CONFIG_CGROUP_MISC not set.
>
> Signed-off-by: Chunguang Xu <[email protected]>
> Reported-by: kernel test robot <[email protected]>

This is different from pid in that there's no actual limit on how many open
files there can be in the system other than the total amount of available
memory. I don't see why this would need a separate limit outside of memory
control. A couple machines I looked at all have file-max at LONG_MAX by
default too.

Thanks.

--
tejun

2021-07-27 03:19:07

by brookxu.cn

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit


Thanks for your time.

Tejun Heo wrote on 2021/7/27 5:27:
> On Thu, Jul 22, 2021 at 11:20:17PM +0800, brookxu wrote:
>> From: Chunguang Xu <[email protected]>
>>
>> Since the global open files are limited, in order to avoid the
>> abnormal behavior of some containers from generating too many
>> files, causing other containers to be unavailable, we need to
>> limit the open files of some containers.
>>
>> v2: fix compile error while CONFIG_CGROUP_MISC not set.
>>
>> Signed-off-by: Chunguang Xu <[email protected]>
>> Reported-by: kernel test robot <[email protected]>
>
> This is different from pid in that there's no actual limit on how many open
> files there can be in the system other than the total amount of available
> memory. I don't see why this would need a separate limit outside of memory
> control. A couple machines I looked at all have file-max at LONG_MAX by
> default too.

According to files_maxfiles_init(), we only allow about 10% of free memory to
create filps, and each filp occupies about 1K of cache. In this way, on a 16G
memory machine, the maximum usable filp is about 1,604,644. In general
scenarios, this may not be a big problem, but if the task is abnormal, it will
very likely become a bottleneck and affect other modules.

> Thanks.
>

2021-07-27 16:33:20

by Tejun Heo

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit

Hello,

On Tue, Jul 27, 2021 at 11:18:00AM +0800, brookxu wrote:
> According to files_maxfiles_init(), we only allow about 10% of free memory to
> create filps, and each filp occupies about 1K of cache. In this way, on a 16G
> memory machine, the maximum usable filp is about 1,604,644. In general
> scenarios, this may not be a big problem, but if the task is abnormal, it will
> very likely become a bottleneck and affect other modules.

Yeah but that can be configured trivially through sysfs. The reason why the
default limit is lowered is because we wanna prevent a part of system to
consume all the memory through fds. With cgroups, we already have that
protection and at least some systems already configure file-max to maximum,
so I don't see a point in adding another interface to subdivide the
artificial limit.

Thanks.

--
tejun

2021-07-28 03:18:47

by brookxu.cn

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit



Tejun Heo wrote on 2021/7/28 12:32 上午:
> Hello,
>
> On Tue, Jul 27, 2021 at 11:18:00AM +0800, brookxu wrote:
>> According to files_maxfiles_init(), we only allow about 10% of free memory to
>> create filps, and each filp occupies about 1K of cache. In this way, on a 16G
>> memory machine, the maximum usable filp is about 1,604,644. In general
>> scenarios, this may not be a big problem, but if the task is abnormal, it will
>> very likely become a bottleneck and affect other modules.
>
> Yeah but that can be configured trivially through sysfs. The reason why the
> default limit is lowered is because we wanna prevent a part of system to
> consume all the memory through fds. With cgroups, we already have that
> protection and at least some systems already configure file-max to maximum,
> so I don't see a point in adding another interface to subdivide the
> artificial limit.
>

Yeah we can adjust file-max through sysctl, but in many cases we adjust it according
to the actual load of the machine, not for abnormal tasks. Another problem is that in
practical applications, kmem_limit will cause some minor problems. In many cases,
kmem_limit is disabled. Limit_in_bytes mainly counts user pages and pagecache, which
may cause files_cache to be out of control. In this case, if file-max is set to MAX,
we may have a risk in the abnormal scene, which prevents us from recovering from the
abnormal scene. Maybe I missed something.

> Thanks.
>

2021-07-28 07:43:30

by Tejun Heo

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit

On Wed, Jul 28, 2021 at 11:17:08AM +0800, brookxu wrote:
> Yeah we can adjust file-max through sysctl, but in many cases we adjust it according
> to the actual load of the machine, not for abnormal tasks. Another problem is that in
> practical applications, kmem_limit will cause some minor problems. In many cases,
> kmem_limit is disabled. Limit_in_bytes mainly counts user pages and pagecache, which
> may cause files_cache to be out of control. In this case, if file-max is set to MAX,
> we may have a risk in the abnormal scene, which prevents us from recovering from the
> abnormal scene. Maybe I missed something.

Kmem control is always on in cgroup2 and has been in wide production use for
years now. If there are problems with it, we need to fix them. That really
doesn't justify adding another feature.

Thanks.

--
tejun

2021-07-28 09:48:22

by brookxu.cn

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit



Tejun Heo wrote on 2021/7/28 3:41 下午:
> On Wed, Jul 28, 2021 at 11:17:08AM +0800, brookxu wrote:
>> Yeah we can adjust file-max through sysctl, but in many cases we adjust it according
>> to the actual load of the machine, not for abnormal tasks. Another problem is that in
>> practical applications, kmem_limit will cause some minor problems. In many cases,
>> kmem_limit is disabled. Limit_in_bytes mainly counts user pages and pagecache, which
>> may cause files_cache to be out of control. In this case, if file-max is set to MAX,
>> we may have a risk in the abnormal scene, which prevents us from recovering from the
>> abnormal scene. Maybe I missed something.
>
> Kmem control is always on in cgroup2 and has been in wide production use for
> years now. If there are problems with it, we need to fix them. That really
> doesn't justify adding another feature.

But considering stability issues(k8s), There are still many production environments use
cgroup v1 without kmem. If kmem is enabled, due to the relatively large granularity
of kmem, this feature can also prevent the abnormal open behavior from making the entire
container unavailable? but I currently do not have this scenario.

Thanks for your time.

> Thanks.
>

2021-07-28 15:40:18

by Tejun Heo

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit

Hello,

On Wed, Jul 28, 2021 at 05:47:05PM +0800, brookxu wrote:
> But considering stability issues(k8s), There are still many production environments use
> cgroup v1 without kmem. If kmem is enabled, due to the relatively large granularity
> of kmem, this feature can also prevent the abnormal open behavior from making the entire
> container unavailable? but I currently do not have this scenario.

Now we are repeating the same points. This simply doesn't justify adding a
user-facing feature that we have to maintain for eternity.

Thanks.

--
tejun

2021-07-29 06:39:07

by brookxu.cn

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/3] misc_cgroup: add support for nofile limit


Thanks for your time.

Tejun Heo wrote on 2021/7/28 11:38 下午:
> Hello,
>
> On Wed, Jul 28, 2021 at 05:47:05PM +0800, brookxu wrote:
>> But considering stability issues(k8s), There are still many production environments use
>> cgroup v1 without kmem. If kmem is enabled, due to the relatively large granularity
>> of kmem, this feature can also prevent the abnormal open behavior from making the entire
>> container unavailable? but I currently do not have this scenario.
>
> Now we are repeating the same points. This simply doesn't justify adding a
> user-facing feature that we have to maintain for eternity.

Ok, thanks you for your patient reply.

> Thanks.
>