From: Alexey Gladkov <[email protected]>
The /proc/meminfo file reports system-wide information regardless of any
cgroup restrictions. This file is still widely used [1], which means that
programs relying on it will not work correctly inside a container
[2][3][4]. Some programs try to respect cgroup limits, but not all of
them implement support for every cgroup version [5].

Correct information can be obtained from cgroups, but that requires the
cgroup filesystem to be available inside the container and the correct
cgroup version to be supported by the program.

There is lxcfs [6], which emulates /proc/meminfo via FUSE to provide
cgroup-aware information. This patch can help it.

This patch adds /proc/self/meminfo, which contains a subset of
/proc/meminfo that respects cgroup restrictions.

We cannot simply create /proc/self/meminfo and make the old location a
symlink to it, because that would break existing apparmor rules [7].
Therefore, the patch adds a separate file with the same format.
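For illustration only (not part of this patch), a consumer of the new file
could look roughly like the sketch below; the helper and its fallback are
hypothetical, but the field format is the same as in /proc/meminfo:

#include <stdio.h>

/* Sketch: read the cgroup-aware MemAvailable, falling back to the
 * system-wide file on kernels without /proc/self/meminfo. */
static long mem_available_kb(void)
{
	FILE *f = fopen("/proc/self/meminfo", "r");
	char line[128];
	long kb = -1;

	if (!f)
		f = fopen("/proc/meminfo", "r");
	if (!f)
		return -1;

	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "MemAvailable: %ld kB", &kb) == 1)
			break;
	}
	fclose(f);
	return kb;
}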
[1] https://codesearch.debian.net/search?q=%2Fproc%2Fmeminfo
[2] https://sources.debian.org/src/erlang/1:23.2.6+dfsg-1/lib/os_mon/c_src/memsup.c#L300
[3] https://sources.debian.org/src/p7zip/16.02+dfsg-8/CPP/Windows/System.cpp/#L103
[4] https://sources.debian.org/src/systemd/247.3-5/src/oom/oomd.c/#L138
[5] https://sources.debian.org/src/nodejs/12.21.0%7Edfsg-4/deps/uv/src/unix/linux-core.c/#L1059
[6] https://linuxcontainers.org/lxcfs/
[7] https://gitlab.com/apparmor/apparmor/-/blob/master/profiles/apparmor.d/abstractions/base#L98
Signed-off-by: Alexey Gladkov <[email protected]>
---
fs/proc/base.c | 2 +
fs/proc/internal.h | 6 ++
fs/proc/meminfo.c | 160 +++++++++++++++++++++++--------------
include/linux/memcontrol.h | 2 +
include/linux/mm.h | 15 ++++
mm/memcontrol.c | 80 +++++++++++++++++++
mm/page_alloc.c | 28 ++++---
7 files changed, 222 insertions(+), 71 deletions(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 58bbf334265b..e95837cf713f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3269,6 +3269,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
#endif
+ ONE("meminfo", S_IRUGO, proc_meminfo_show),
};
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3602,6 +3603,7 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
#endif
+ ONE("meminfo", S_IRUGO, proc_meminfo_show),
};
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 03415f3fb3a8..a6e8540afbd3 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -241,6 +241,12 @@ extern int proc_net_init(void);
static inline int proc_net_init(void) { return 0; }
#endif
+/*
+ * meminfo.c
+ */
+extern int proc_meminfo_show(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk);
+
/*
* proc_self.c
*/
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 6fa761c9cc78..3587a79d4b96 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -16,6 +16,9 @@
#ifdef CONFIG_CMA
#include <linux/cma.h>
#endif
+#ifdef CONFIG_MEMCG
+#include <linux/memcontrol.h>
+#endif
#include <asm/page.h>
#include "internal.h"
@@ -23,91 +26,112 @@ void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
{
}
+static void proc_fill_meminfo(struct meminfo *mi)
+{
+ int lru;
+ long cached;
+
+ si_meminfo(&mi->si);
+ si_swapinfo(&mi->si);
+
+ for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+ mi->pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
+
+ cached = global_node_page_state(NR_FILE_PAGES) - total_swapcache_pages() - mi->si.bufferram;
+ if (cached < 0)
+ cached = 0;
+
+ mi->cached = cached;
+ mi->swapcached = total_swapcache_pages();
+ mi->slab_reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B);
+ mi->slab_unreclaimable = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B);
+ mi->anon_pages = global_node_page_state(NR_ANON_MAPPED);
+ mi->mapped = global_node_page_state(NR_FILE_MAPPED);
+ mi->nr_pagetable = global_node_page_state(NR_PAGETABLE);
+ mi->dirty_pages = global_node_page_state(NR_FILE_DIRTY);
+ mi->writeback_pages = global_node_page_state(NR_WRITEBACK);
+}
+
+#ifdef CONFIG_MEMCG
+static inline void fill_meminfo(struct meminfo *mi, struct task_struct *task)
+{
+ mem_fill_meminfo(mi, task);
+}
+#else
+static inline void fill_meminfo(struct meminfo *mi, struct task_struct *task)
+{
+ proc_fill_meminfo(mi);
+}
+#endif
+
static void show_val_kb(struct seq_file *m, const char *s, unsigned long num)
{
seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8);
seq_write(m, " kB\n", 4);
}
+static int meminfo_proc_show_mi(struct seq_file *m, struct meminfo *mi)
+{
+ show_val_kb(m, "MemTotal: ", mi->si.totalram);
+ show_val_kb(m, "MemFree: ", mi->si.freeram);
+ show_val_kb(m, "MemAvailable: ", si_mem_available_mi(mi));
+ show_val_kb(m, "Buffers: ", mi->si.bufferram);
+ show_val_kb(m, "Cached: ", mi->cached);
+ show_val_kb(m, "SwapCached: ", mi->swapcached);
+ show_val_kb(m, "Active: ", mi->pages[LRU_ACTIVE_ANON] + mi->pages[LRU_ACTIVE_FILE]);
+ show_val_kb(m, "Inactive: ", mi->pages[LRU_INACTIVE_ANON] + mi->pages[LRU_INACTIVE_FILE]);
+ show_val_kb(m, "Active(anon): ", mi->pages[LRU_ACTIVE_ANON]);
+ show_val_kb(m, "Inactive(anon): ", mi->pages[LRU_INACTIVE_ANON]);
+ show_val_kb(m, "Active(file): ", mi->pages[LRU_ACTIVE_FILE]);
+ show_val_kb(m, "Inactive(file): ", mi->pages[LRU_INACTIVE_FILE]);
+ show_val_kb(m, "Unevictable: ", mi->pages[LRU_UNEVICTABLE]);
+
+#ifdef CONFIG_HIGHMEM
+ show_val_kb(m, "HighTotal: ", mi->si.totalhigh);
+ show_val_kb(m, "HighFree: ", mi->si.freehigh);
+ show_val_kb(m, "LowTotal: ", mi->si.totalram - mi->si.totalhigh);
+ show_val_kb(m, "LowFree: ", mi->si.freeram - mi->si.freehigh);
+#endif
+
+ show_val_kb(m, "SwapTotal: ", mi->si.totalswap);
+ show_val_kb(m, "SwapFree: ", mi->si.freeswap);
+ show_val_kb(m, "Dirty: ", mi->dirty_pages);
+ show_val_kb(m, "Writeback: ", mi->writeback_pages);
+
+ show_val_kb(m, "AnonPages: ", mi->anon_pages);
+ show_val_kb(m, "Mapped: ", mi->mapped);
+ show_val_kb(m, "Shmem: ", mi->si.sharedram);
+ show_val_kb(m, "Slab: ", mi->slab_reclaimable + mi->slab_unreclaimable);
+ show_val_kb(m, "SReclaimable: ", mi->slab_reclaimable);
+ show_val_kb(m, "SUnreclaim: ", mi->slab_unreclaimable);
+ show_val_kb(m, "PageTables: ", mi->nr_pagetable);
+
+ return 0;
+}
+
static int meminfo_proc_show(struct seq_file *m, void *v)
{
- struct sysinfo i;
- unsigned long committed;
- long cached;
- long available;
- unsigned long pages[NR_LRU_LISTS];
- unsigned long sreclaimable, sunreclaim;
- int lru;
- si_meminfo(&i);
- si_swapinfo(&i);
- committed = vm_memory_committed();
+ struct meminfo mi;
- cached = global_node_page_state(NR_FILE_PAGES) -
- total_swapcache_pages() - i.bufferram;
- if (cached < 0)
- cached = 0;
+ proc_fill_meminfo(&mi);
+ meminfo_proc_show_mi(m, &mi);
- for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
- pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
-
- available = si_mem_available();
- sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B);
- sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B);
-
- show_val_kb(m, "MemTotal: ", i.totalram);
- show_val_kb(m, "MemFree: ", i.freeram);
- show_val_kb(m, "MemAvailable: ", available);
- show_val_kb(m, "Buffers: ", i.bufferram);
- show_val_kb(m, "Cached: ", cached);
- show_val_kb(m, "SwapCached: ", total_swapcache_pages());
- show_val_kb(m, "Active: ", pages[LRU_ACTIVE_ANON] +
- pages[LRU_ACTIVE_FILE]);
- show_val_kb(m, "Inactive: ", pages[LRU_INACTIVE_ANON] +
- pages[LRU_INACTIVE_FILE]);
- show_val_kb(m, "Active(anon): ", pages[LRU_ACTIVE_ANON]);
- show_val_kb(m, "Inactive(anon): ", pages[LRU_INACTIVE_ANON]);
- show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]);
- show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]);
- show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]);
show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK));
-#ifdef CONFIG_HIGHMEM
- show_val_kb(m, "HighTotal: ", i.totalhigh);
- show_val_kb(m, "HighFree: ", i.freehigh);
- show_val_kb(m, "LowTotal: ", i.totalram - i.totalhigh);
- show_val_kb(m, "LowFree: ", i.freeram - i.freehigh);
-#endif
-
#ifndef CONFIG_MMU
show_val_kb(m, "MmapCopy: ",
(unsigned long)atomic_long_read(&mmap_pages_allocated));
#endif
- show_val_kb(m, "SwapTotal: ", i.totalswap);
- show_val_kb(m, "SwapFree: ", i.freeswap);
- show_val_kb(m, "Dirty: ",
- global_node_page_state(NR_FILE_DIRTY));
- show_val_kb(m, "Writeback: ",
- global_node_page_state(NR_WRITEBACK));
- show_val_kb(m, "AnonPages: ",
- global_node_page_state(NR_ANON_MAPPED));
- show_val_kb(m, "Mapped: ",
- global_node_page_state(NR_FILE_MAPPED));
- show_val_kb(m, "Shmem: ", i.sharedram);
- show_val_kb(m, "KReclaimable: ", sreclaimable +
+ show_val_kb(m, "KReclaimable: ", mi.slab_reclaimable +
global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE));
- show_val_kb(m, "Slab: ", sreclaimable + sunreclaim);
- show_val_kb(m, "SReclaimable: ", sreclaimable);
- show_val_kb(m, "SUnreclaim: ", sunreclaim);
seq_printf(m, "KernelStack: %8lu kB\n",
global_node_page_state(NR_KERNEL_STACK_KB));
#ifdef CONFIG_SHADOW_CALL_STACK
seq_printf(m, "ShadowCallStack:%8lu kB\n",
global_node_page_state(NR_KERNEL_SCS_KB));
#endif
- show_val_kb(m, "PageTables: ",
- global_node_page_state(NR_PAGETABLE));
show_val_kb(m, "NFS_Unstable: ", 0);
show_val_kb(m, "Bounce: ",
@@ -115,7 +139,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "WritebackTmp: ",
global_node_page_state(NR_WRITEBACK_TEMP));
show_val_kb(m, "CommitLimit: ", vm_commit_limit());
- show_val_kb(m, "Committed_AS: ", committed);
+ show_val_kb(m, "Committed_AS: ", vm_memory_committed());
seq_printf(m, "VmallocTotal: %8lu kB\n",
(unsigned long)VMALLOC_TOTAL >> 10);
show_val_kb(m, "VmallocUsed: ", vmalloc_nr_pages());
@@ -153,6 +177,20 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
return 0;
}
+int proc_meminfo_show(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ struct meminfo mi;
+
+ fill_meminfo(&mi, task);
+
+ meminfo_proc_show_mi(m, &mi);
+ hugetlb_report_meminfo(m);
+ arch_report_meminfo(m);
+
+ return 0;
+}
+
static int __init proc_meminfo_init(void)
{
proc_create_single("meminfo", 0, NULL, meminfo_proc_show);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index c193be760709..4a7e2894954f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1119,6 +1119,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
gfp_t gfp_mask,
unsigned long *total_scanned);
+void mem_fill_meminfo(struct meminfo *mi, struct task_struct *task);
+
#else /* CONFIG_MEMCG */
#define MEM_CGROUP_ID_SHIFT 0
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c274f75efcf9..7faeaddd5b88 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2467,6 +2467,20 @@ static inline int early_pfn_to_nid(unsigned long pfn)
extern int __meminit early_pfn_to_nid(unsigned long pfn);
#endif
+struct meminfo {
+ struct sysinfo si;
+ unsigned long pages[NR_LRU_LISTS];
+ unsigned long cached;
+ unsigned long swapcached;
+ unsigned long anon_pages;
+ unsigned long mapped;
+ unsigned long nr_pagetable;
+ unsigned long dirty_pages;
+ unsigned long writeback_pages;
+ unsigned long slab_reclaimable;
+ unsigned long slab_unreclaimable;
+};
+
extern void set_dma_reserve(unsigned long new_dma_reserve);
extern void memmap_init_range(unsigned long, int, unsigned long,
unsigned long, unsigned long, enum meminit_context,
@@ -2477,6 +2491,7 @@ extern int __meminit init_per_zone_wmark_min(void);
extern void mem_init(void);
extern void __init mmap_init(void);
extern void show_mem(unsigned int flags, nodemask_t *nodemask);
+extern long si_mem_available_mi(struct meminfo *mi);
extern long si_mem_available(void);
extern void si_meminfo(struct sysinfo * val);
extern void si_meminfo_node(struct sysinfo *val, int nid);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 64ada9e650a5..344b546f9e25 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3750,6 +3750,86 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
return nr;
}
+static void mem_cgroup_nr_pages(struct mem_cgroup *memcg, int nid, unsigned long *pages)
+{
+ struct mem_cgroup *iter;
+ int i;
+
+ for_each_mem_cgroup_tree(iter, memcg) {
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ pages[i] += mem_cgroup_node_nr_lru_pages(iter, nid, BIT(i), false);
+ }
+}
+
+static void mem_cgroup_si_meminfo(struct sysinfo *si, struct task_struct *task)
+{
+ unsigned long memtotal, memused, swapsize;
+ struct mem_cgroup *memcg;
+ struct cgroup_subsys_state *css;
+
+ css = task_css(task, memory_cgrp_id);
+ memcg = mem_cgroup_from_css(css);
+
+ memtotal = READ_ONCE(memcg->memory.max);
+
+ if (memtotal != PAGE_COUNTER_MAX) {
+ memused = page_counter_read(&memcg->memory);
+
+ si->totalram = memtotal;
+ si->freeram = (memtotal > memused ? memtotal - memused : 0);
+ si->sharedram = memcg_page_state(memcg, NR_SHMEM);
+
+ si->bufferram = nr_blockdev_pages();
+ si->totalhigh = totalhigh_pages();
+ si->freehigh = nr_free_highpages();
+ si->mem_unit = PAGE_SIZE;
+ } else {
+ si_meminfo(si);
+ memused = si->totalram - si->freeram;
+ }
+
+ swapsize = READ_ONCE(memcg->memsw.max);
+
+ if (swapsize != PAGE_COUNTER_MAX) {
+ unsigned long swaptotal, swapused;
+
+ swaptotal = swapsize - memtotal;
+ swapused = page_counter_read(&memcg->memsw) - memused;
+ si->totalswap = swaptotal;
+ /* Due to global reclaim, memory.memsw.usage can be greater than
+ * (memory.memsw.max - memory.max). */
+ si->freeswap = (swaptotal > swapused ? swaptotal - swapused : 0);
+ } else {
+ si_swapinfo(si);
+ }
+
+ css_put(css);
+}
+
+void mem_fill_meminfo(struct meminfo *mi, struct task_struct *task)
+{
+ struct cgroup_subsys_state *memcg_css = task_css(task, memory_cgrp_id);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(memcg_css);
+ int nid;
+
+ memset(&mi->pages, 0, sizeof(mi->pages));
+
+ mem_cgroup_si_meminfo(&mi->si, task);
+
+ for_each_online_node(nid)
+ mem_cgroup_nr_pages(memcg, nid, mi->pages);
+
+ mi->slab_reclaimable = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B);
+ mi->slab_unreclaimable = memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
+ mi->cached = memcg_page_state(memcg, NR_FILE_PAGES);
+ mi->swapcached = memcg_page_state(memcg, NR_SWAPCACHE);
+ mi->anon_pages = memcg_page_state(memcg, NR_ANON_MAPPED);
+ mi->mapped = memcg_page_state(memcg, NR_FILE_MAPPED);
+ mi->nr_pagetable = memcg_page_state(memcg, NR_PAGETABLE);
+ mi->dirty_pages = memcg_page_state(memcg, NR_FILE_DIRTY);
+ mi->writeback_pages = memcg_page_state(memcg, NR_WRITEBACK);
+}
+
static int memcg_numa_stat_show(struct seq_file *m, void *v)
{
struct numa_stat {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index aaa1655cf682..0a3c9dcd2c13 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5551,18 +5551,13 @@ static inline void show_node(struct zone *zone)
printk("Node %d ", zone_to_nid(zone));
}
-long si_mem_available(void)
+long si_mem_available_mi(struct meminfo *mi)
{
long available;
unsigned long pagecache;
unsigned long wmark_low = 0;
- unsigned long pages[NR_LRU_LISTS];
unsigned long reclaimable;
struct zone *zone;
- int lru;
-
- for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
- pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
for_each_zone(zone)
wmark_low += low_wmark_pages(zone);
@@ -5571,14 +5566,14 @@ long si_mem_available(void)
* Estimate the amount of memory available for userspace allocations,
* without causing swapping.
*/
- available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
+ available = mi->si.freeram - totalreserve_pages;
/*
* Not all the page cache can be freed, otherwise the system will
* start swapping. Assume at least half of the page cache, or the
* low watermark worth of cache, needs to stay.
*/
- pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
+ pagecache = mi->pages[LRU_ACTIVE_FILE] + mi->pages[LRU_INACTIVE_FILE];
pagecache -= min(pagecache / 2, wmark_low);
available += pagecache;
@@ -5587,14 +5582,27 @@ long si_mem_available(void)
* items that are in use, and cannot be freed. Cap this estimate at the
* low watermark.
*/
- reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
- global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
+ reclaimable = mi->slab_reclaimable + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
available += reclaimable - min(reclaimable / 2, wmark_low);
if (available < 0)
available = 0;
return available;
}
+
+long si_mem_available(void)
+{
+ struct meminfo mi;
+ int lru;
+
+ for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+ mi.pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
+
+ mi.si.freeram = global_zone_page_state(NR_FREE_PAGES);
+ mi.slab_reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B);
+
+ return si_mem_available_mi(&mi);
+}
EXPORT_SYMBOL_GPL(si_mem_available);
void si_meminfo(struct sysinfo *val)
--
2.29.3
[Cc linux-api]
On Thu 03-06-21 12:43:07, [email protected] wrote:
> From: Alexey Gladkov <[email protected]>
>
> The /proc/meminfo contains information regardless of the cgroups
> restrictions. This file is still widely used [1]. This means that all
> these programs will not work correctly inside container [2][3][4]. Some
> programs try to respect the cgroups limits, but not all of them
> implement support for all cgroup versions [5].
>
> Correct information can be obtained from cgroups, but this requires the
> cgroups to be available inside container and the correct version of
> cgroups to be supported.
>
> There is lxcfs [6] that emulates /proc/meminfo using fuse to provide
> information regarding cgroups. This patch can help them.
>
> This patch adds /proc/self/meminfo that contains a subset of
> /proc/meminfo respecting cgroup restrictions.
>
> We cannot just create /proc/self/meminfo and make a symlink at the old
> location because this will break the existing apparmor rules [7].
> Therefore, the patch adds a separate file with the same format.
>
> [1] https://codesearch.debian.net/search?q=%2Fproc%2Fmeminfo
> [2] https://sources.debian.org/src/erlang/1:23.2.6+dfsg-1/lib/os_mon/c_src/memsup.c#L300
> [3] https://sources.debian.org/src/p7zip/16.02+dfsg-8/CPP/Windows/System.cpp/#L103
> [4] https://sources.debian.org/src/systemd/247.3-5/src/oom/oomd.c/#L138
> [5] https://sources.debian.org/src/nodejs/12.21.0%7Edfsg-4/deps/uv/src/unix/linux-core.c/#L1059
> [6] https://linuxcontainers.org/lxcfs/
> [7] https://gitlab.com/apparmor/apparmor/-/blob/master/profiles/apparmor.d/abstractions/base#L98
>
> Signed-off-by: Alexey Gladkov <[email protected]>
--
Michal Hocko
SUSE Labs
Hi Alexey,
[email protected] writes:
>From: Alexey Gladkov <[email protected]>
>The /proc/meminfo contains information regardless of the cgroups
>restrictions. This file is still widely used [1]. This means that all
>these programs will not work correctly inside container [2][3][4]. Some
>programs try to respect the cgroups limits, but not all of them
>implement support for all cgroup versions [5].
>
>Correct information can be obtained from cgroups, but this requires the
>cgroups to be available inside container and the correct version of
>cgroups to be supported.
Then they should add support for it. We already export these metrics as part of
cgroups and plenty of applications like Docker, podman, containerd, systemd,
runc, etc already support it.
Putting stuff in /proc to get around the problem of "some other metric I need
might not be exported to a container" is not a very compelling argument. If
they want it, then export it to the container...
Ultimately, if they're going to have to add support for a new
/proc/self/meminfo file anyway, these use cases should just do it properly
through the already supported APIs.
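(For concreteness, "doing it properly" on cgroup2 might look roughly like
the sketch below. It assumes the unified hierarchy is mounted at
/sys/fs/cgroup and only handles the v2 entry in /proc/self/cgroup; the
helper names are made up:)

#include <stdio.h>
#include <string.h>

/* Sketch: resolve this task's cgroup2 directory from /proc/self/cgroup. */
static int self_cgroup_dir(char *buf, size_t len)
{
	FILE *f = fopen("/proc/self/cgroup", "r");
	char line[256];
	int ret = -1;

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "0::", 3) == 0) {	/* the cgroup2 entry */
			line[strcspn(line, "\n")] = '\0';
			snprintf(buf, len, "/sys/fs/cgroup%s", line + 3);
			ret = 0;
			break;
		}
	}
	fclose(f);
	return ret;
}

/* Sketch: read e.g. "memory.current" or "memory.max" from that directory.
 * Note that memory.max may contain the string "max" rather than a number. */
static int read_memcg_value(const char *name, char *out, size_t len)
{
	char dir[256], path[512];
	FILE *f;

	if (self_cgroup_dir(dir, sizeof(dir)) < 0)
		return -1;
	snprintf(path, sizeof(path), "%s/%s", dir, name);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (!fgets(out, len, f))
		*out = '\0';
	fclose(f);
	return 0;
}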
>+ for_each_online_node(nid)
>+ mem_cgroup_nr_pages(memcg, nid, mi->pages);
>+
>+ mi->slab_reclaimable = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B);
>+ mi->slab_unreclaimable = memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
>+ mi->cached = memcg_page_state(memcg, NR_FILE_PAGES);
>+ mi->swapcached = memcg_page_state(memcg, NR_SWAPCACHE);
>+ mi->anon_pages = memcg_page_state(memcg, NR_ANON_MAPPED);
>+ mi->mapped = memcg_page_state(memcg, NR_FILE_MAPPED);
>+ mi->nr_pagetable = memcg_page_state(memcg, NR_PAGETABLE);
>+ mi->dirty_pages = memcg_page_state(memcg, NR_FILE_DIRTY);
>+ mi->writeback_pages = memcg_page_state(memcg, NR_WRITEBACK);
>+}
This presents an extraordinarily confusing API. A cgroup can contain more than
one process, so it's not right to present this information as "meminfo" in
/proc/self when these statistics may not have any relation to the current task
in question.
On 03.06.21 13:33, Chris Down wrote:
Hi folks,
> Putting stuff in /proc to get around the problem of "some other metric I
> need might not be exported to a container" is not a very compelling
> argument. If they want it, then export it to the container...
>
> Ultimately, if they're going to have to add support for a new
> /proc/self/meminfo file anyway, these use cases should just do it
> properly through the already supported APIs.
It's even a bit more complex ...
/proc/meminfo always tells what the *machine* has available, not what a
process can eat up. That has been the case since long before cgroups
(e.g. ulimits).

Even if you want a container to look more like a VM - with /proc/meminfo
showing what the container (instead of the machine) has available - just
looking at the calling task's cgroup is also wrong, because there are
cgroups outside containers (that really shouldn't be affected) and there
are even other cgroups inside the container (that further restrict below
the container's limits).

BTW: applications trying to autotune themselves by looking at
/proc/meminfo are broken-by-design anyway. This has never been a valid
metric for how much memory individual processes can or should eat.
--mtx
--
Enrico Weigelt, metux IT consult
Free software and Linux embedded engineering
[email protected] -- +49-151-27565287
"Enrico Weigelt, metux IT consult" <[email protected]> writes:
> On 03.06.21 13:33, Chris Down wrote:
>
> Hi folks,
>
>
>> Putting stuff in /proc to get around the problem of "some other metric I need
>> might not be exported to a container" is not a very compelling argument. If
>> they want it, then export it to the container...
>>
>> Ultimately, if they're going to have to add support for a new
>> /proc/self/meminfo file anyway, these use cases should just do it properly
>> through the already supported APIs.
>
> It's even a bit more complex ...
>
> /proc/meminfo always tells what the *machine* has available, not what a
> process can eat up. That has been this way even long before cgroups.
> (eg. ulimits).
>
> Even if you want a container look more like a VM - /proc/meminfo showing
> what the container (instead of the machine) has available - just looking
> at the calling task's cgroup is also wrong. Because there're cgroups
> outside containers (that really shouldn't be affected) and there're even
> other cgroups inside the container (that further restrict below the
> container's limits).
>
> BTW: applications trying to autotune themselves by looking at
> /proc/meminfo are broken-by-design anyways. This never has been a valid
> metric on how much memory invididual processes can or should eat.
Which brings us to the problem.
Using /proc/meminfo is not valid unless your application can know it has
the machine to itself - something that is becoming increasingly less
common.
Unless something has changed in the last couple of years, reading values
out of the cgroup filesystem is both difficult (v1 and v2 have some
gratuitous differences) and actively discouraged.
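(To illustrate the kind of difference involved, a rough sketch of merely
locating the memory limit; the mount points are the conventional ones and
the helper is made up. The caller still has to dig the cgroup path out of
/proc/self/cgroup, whose format also differs between v1 and v2:)

#include <stdio.h>

/* Sketch: even the name and location of the limit file differ.
 * @cgrp is the cgroup path relative to the hierarchy root, e.g. "/payload". */
static FILE *open_memory_limit(const char *cgrp)
{
	char path[512];
	FILE *f;

	/* cgroup v2: unified hierarchy, file is "memory.max" ("max" = no limit) */
	snprintf(path, sizeof(path), "/sys/fs/cgroup%s/memory.max", cgrp);
	f = fopen(path, "r");
	if (f)
		return f;

	/* cgroup v1: per-controller hierarchy, file is "memory.limit_in_bytes",
	 * and "no limit" shows up as a very large number instead of "max" */
	snprintf(path, sizeof(path), "/sys/fs/cgroup/memory%s/memory.limit_in_bytes", cgrp);
	return fopen(path, "r");
}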
So what should applications do?
Alex has found applications that are trying to do something with
meminfo, and the fields that those applications care about. I don't see
anyone making the case that specifically what the applications are
trying to do is buggy.
Alex's suggestion is to have a /proc/self/meminfo that has the information
that applications want, which would be something that would be easy
to switch applications to. The patch to userspace at that point is
as simple as 3 lines of code. I can imagine people taking that patch into
their userspace programs.
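(Hypothetically, something along these lines - a sketch, not actual code
from any of the projects Alex cited:)

-	f = fopen("/proc/meminfo", "r");
+	f = fopen("/proc/self/meminfo", "r");	/* cgroup-aware, if available */
+	if (!f)
+		f = fopen("/proc/meminfo", "r");	/* fall back to the old file */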
The simple fact that people are using /proc/meminfo when it doesn't make
sense for anything except system monitoring tools is a pretty solid bug
report on the existing Linux APIs.
So how do people suggest these applications get the information they
need?
Alex perhaps I missed it, but I didn't see a lot of description on why
the individual applications are using meminfo. Can you provide a bit
more detail? At least for several of them, so we can see the trends
and say yes this kind of information makes sense to provide to
applications. I think that would help move forward the discussion about
the best way to provide that information to userspace.
Eric
On Wed, Jun 09, 2021 at 02:14:16PM -0500, Eric W. Biederman wrote:
> "Enrico Weigelt, metux IT consult" <[email protected]> writes:
>
> > On 03.06.21 13:33, Chris Down wrote:
> >
> > Hi folks,
> >
> >
> >> Putting stuff in /proc to get around the problem of "some other metric I need
> >> might not be exported to a container" is not a very compelling argument. If
> >> they want it, then export it to the container...
> >>
> >> Ultimately, if they're going to have to add support for a new
> >> /proc/self/meminfo file anyway, these use cases should just do it properly
> >> through the already supported APIs.
> >
> > It's even a bit more complex ...
> >
> > /proc/meminfo always tells what the *machine* has available, not what a
> > process can eat up. That has been this way even long before cgroups.
> > (eg. ulimits).
> >
> > Even if you want a container look more like a VM - /proc/meminfo showing
> > what the container (instead of the machine) has available - just looking
> > at the calling task's cgroup is also wrong. Because there're cgroups
> > outside containers (that really shouldn't be affected) and there're even
> > other cgroups inside the container (that further restrict below the
> > container's limits).
> >
> > BTW: applications trying to autotune themselves by looking at
> > /proc/meminfo are broken-by-design anyways. This never has been a valid
> > metric on how much memory invididual processes can or should eat.
>
> Which brings us to the problem.
>
> Using /proc/meminfo is not valid unless your application can know it has
> the machine to itself. Something that is becoming increasing less
> common.
>
> Unless something has changed in the last couple of years, reading values
> out of the cgroup filesystem is both difficult (v1 and v2 have some
> gratuitous differences) and is actively discouraged.
>
> So what should applications do?
>
> Alex has found applications that are trying to do something with
> meminfo, and the fields that those applications care about. I don't see
> anyone making the case that specifically what the applications are
> trying to do is buggy.
>
> Alex's suggest is to have a /proc/self/meminfo that has the information
> that applications want, which would be something that would be easy
> to switch applications to. The patch to userspace at that point is
> as simple as 3 lines of code. I can imagine people take that patch into
> their userspace programs.
But is it actually what applications want?
Not all the information at the system level translates well to the
container level. Things like available memory require a hierarchical
assessment rather than just a look at the local level, since there
could be limits higher up the tree.
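(To make "hierarchical" concrete: the effective limit is roughly the
minimum of memory.max over the cgroup and all of its ancestors - something
like the sketch below, which ignores memory.min/low protections, swap and
cgroup1 entirely:)

#include <limits.h>
#include <stdio.h>
#include <string.h>

/* Sketch: walk from a cgroup2 directory (an absolute path below
 * /sys/fs/cgroup) up to the mount root, taking the smallest memory.max
 * seen along the way. */
static unsigned long long effective_memory_max(const char *cgroup_dir)
{
	unsigned long long limit = ULLONG_MAX, val;
	char dir[512], file[600], buf[64];
	FILE *f;

	snprintf(dir, sizeof(dir), "%s", cgroup_dir);
	while (strlen(dir) > strlen("/sys/fs/cgroup")) {
		snprintf(file, sizeof(file), "%s/memory.max", dir);
		f = fopen(file, "r");
		if (f) {
			/* a literal "max" fails the sscanf and is skipped */
			if (fgets(buf, sizeof(buf), f) &&
			    sscanf(buf, "%llu", &val) == 1 && val < limit)
				limit = val;
			fclose(f);
		}
		*strrchr(dir, '/') = '\0';	/* go one level up */
	}
	return limit;	/* ULLONG_MAX means no limit anywhere up the tree */
}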
Not all items in meminfo have a container equivalent, either.
The familiar format is likely a liability rather than an asset.
> The simple fact that people are using /proc/meminfo when it doesn't make
> sense for anything except system monitoring tools is a pretty solid bug
> report on the existing linux apis.
I agree that we likely need a better interface for applications to
query the memory state of their container. But I don't think we should
try to emulate a format that is a poor fit for this.
We should also not speculate what users intended to do with the
meminfo data right now. There is a surprising amount of misconception
around what these values actually mean. I'd rather have users show up
on the mailing list directly and outline the broader usecase.
Johannes Weiner <[email protected]> writes:
> On Wed, Jun 09, 2021 at 02:14:16PM -0500, Eric W. Biederman wrote:
>> "Enrico Weigelt, metux IT consult" <[email protected]> writes:
>>
>> > On 03.06.21 13:33, Chris Down wrote:
>> >
>> > Hi folks,
>> >
>> >
>> >> Putting stuff in /proc to get around the problem of "some other metric I need
>> >> might not be exported to a container" is not a very compelling argument. If
>> >> they want it, then export it to the container...
>> >>
>> >> Ultimately, if they're going to have to add support for a new
>> >> /proc/self/meminfo file anyway, these use cases should just do it properly
>> >> through the already supported APIs.
>> >
>> > It's even a bit more complex ...
>> >
>> > /proc/meminfo always tells what the *machine* has available, not what a
>> > process can eat up. That has been this way even long before cgroups.
>> > (eg. ulimits).
>> >
>> > Even if you want a container look more like a VM - /proc/meminfo showing
>> > what the container (instead of the machine) has available - just looking
>> > at the calling task's cgroup is also wrong. Because there're cgroups
>> > outside containers (that really shouldn't be affected) and there're even
>> > other cgroups inside the container (that further restrict below the
>> > container's limits).
>> >
>> > BTW: applications trying to autotune themselves by looking at
>> > /proc/meminfo are broken-by-design anyways. This never has been a valid
>> > metric on how much memory invididual processes can or should eat.
>>
>> Which brings us to the problem.
>>
>> Using /proc/meminfo is not valid unless your application can know it has
>> the machine to itself. Something that is becoming increasing less
>> common.
>>
>> Unless something has changed in the last couple of years, reading values
>> out of the cgroup filesystem is both difficult (v1 and v2 have some
>> gratuitous differences) and is actively discouraged.
>>
>> So what should applications do?
>>
>> Alex has found applications that are trying to do something with
>> meminfo, and the fields that those applications care about. I don't see
>> anyone making the case that specifically what the applications are
>> trying to do is buggy.
>>
>> Alex's suggest is to have a /proc/self/meminfo that has the information
>> that applications want, which would be something that would be easy
>> to switch applications to. The patch to userspace at that point is
>> as simple as 3 lines of code. I can imagine people take that patch into
>> their userspace programs.
>
> But is it actually what applications want?
>
> Not all the information at the system level translates well to the
> container level. Things like available memory require a hierarchical
> assessment rather than just a look at the local level, since there
> could be limits higher up the tree.
That sounds like a bug in the implementation of /proc/self/meminfo.
It certainly is a legitimate question to ask what are the limits
from my perspective.
> Not all items in meminfo have a container equivalent, either.
Not all items in meminfo were implemented.
> The familiar format is likely a liability rather than an asset.
It could be. At the same time that is the only format anyone has
proposed, so a good counter proposal would be appreciated if you don't
like the code that has been written.
>> The simple fact that people are using /proc/meminfo when it doesn't make
>> sense for anything except system monitoring tools is a pretty solid bug
>> report on the existing linux apis.
>
> I agree that we likely need a better interface for applications to
> query the memory state of their container. But I don't think we should
> try to emulate a format that is a poor fit for this.
I don't think it is the container that we care about (except for maybe
system management tools). I think the truly interesting case is
applications asking what do I have available to me.
> We should also not speculate what users intended to do with the
> meminfo data right now. There is a surprising amount of misconception
> around what these values actually mean. I'd rather have users show up
> on the mailing list directly and outline the broader usecase.
We are kernel developers, we can read code. We don't need to speculate.
We can read the userspace code. If things are not clear we can ask
their developers.
Eric
On 6/9/21 16:56, Eric W. Biederman wrote:
> Johannes Weiner <[email protected]> writes:
>
>> On Wed, Jun 09, 2021 at 02:14:16PM -0500, Eric W. Biederman wrote:
>>> "Enrico Weigelt, metux IT consult" <[email protected]> writes:
>>>
>>>> On 03.06.21 13:33, Chris Down wrote:
>>>>
>>>> Hi folks,
>>>>
>>>>
>>>>> Putting stuff in /proc to get around the problem of "some other metric I need
>>>>> might not be exported to a container" is not a very compelling argument. If
>>>>> they want it, then export it to the container...
>>>>>
>>>>> Ultimately, if they're going to have to add support for a new
>>>>> /proc/self/meminfo file anyway, these use cases should just do it properly
>>>>> through the already supported APIs.
>>>> It's even a bit more complex ...
>>>>
>>>> /proc/meminfo always tells what the *machine* has available, not what a
>>>> process can eat up. That has been this way even long before cgroups.
>>>> (eg. ulimits).
>>>>
>>>> Even if you want a container look more like a VM - /proc/meminfo showing
>>>> what the container (instead of the machine) has available - just looking
>>>> at the calling task's cgroup is also wrong. Because there're cgroups
>>>> outside containers (that really shouldn't be affected) and there're even
>>>> other cgroups inside the container (that further restrict below the
>>>> container's limits).
>>>>
>>>> BTW: applications trying to autotune themselves by looking at
>>>> /proc/meminfo are broken-by-design anyways. This never has been a valid
>>>> metric on how much memory invididual processes can or should eat.
>>> Which brings us to the problem.
>>>
>>> Using /proc/meminfo is not valid unless your application can know it has
>>> the machine to itself. Something that is becoming increasing less
>>> common.
>>>
>>> Unless something has changed in the last couple of years, reading values
>>> out of the cgroup filesystem is both difficult (v1 and v2 have some
>>> gratuitous differences) and is actively discouraged.
>>>
>>> So what should applications do?
>>>
>>> Alex has found applications that are trying to do something with
>>> meminfo, and the fields that those applications care about. I don't see
>>> anyone making the case that specifically what the applications are
>>> trying to do is buggy.
>>>
>>> Alex's suggest is to have a /proc/self/meminfo that has the information
>>> that applications want, which would be something that would be easy
>>> to switch applications to. The patch to userspace at that point is
>>> as simple as 3 lines of code. I can imagine people take that patch into
>>> their userspace programs.
>> But is it actually what applications want?
>>
>> Not all the information at the system level translates well to the
>> container level. Things like available memory require a hierarchical
>> assessment rather than just a look at the local level, since there
>> could be limits higher up the tree.
> That sounds like a bug in the implementation of /proc/self/meminfo.
>
> It certainly is a legitimate question to ask what are the limits
> from my perspective.
>
>> Not all items in meminfo have a container equivalent, either.
> Not all items in meminfo were implemented.
>
>> The familiar format is likely a liability rather than an asset.
> It could be. At the same time that is the only format anyone has
> proposed so we good counter proposal would be appreciated if you don't
> like the code that has been written.
>
>>> The simple fact that people are using /proc/meminfo when it doesn't make
>>> sense for anything except system monitoring tools is a pretty solid bug
>>> report on the existing linux apis.
>> I agree that we likely need a better interface for applications to
>> query the memory state of their container. But I don't think we should
>> try to emulate a format that is a poor fit for this.
> I don't think it is the container that we care about (except for maybe
> system managment tools). I think the truly interesting case is
> applications asking what do I have available to me.
I have heard that the JRE makes assumptions about the number of threads
to use based on memory.

Lots of humans use top and vmstat to try to figure out what is available
in their environment, and debugging tools try to figure out why an
application is running poorly.

We would like to not need to mount the cgroup file system into a
container at all, and, as Eric stated, processes struggle to
differentiate between cgroupv1 and cgroupv2.
>> We should also not speculate what users intended to do with the
>> meminfo data right now. There is a surprising amount of misconception
>> around what these values actually mean. I'd rather have users show up
>> on the mailing list directly and outline the broader usecase.
> We are kernel developers, we can read code. We don't need to speculate.
> We can read the userspace code. If things are not clear we can ask
> their developers.
>
> Eric
>
>
On 09.06.21 22:31, Johannes Weiner wrote:
>> Alex has found applications that are trying to do something with
>> meminfo, and the fields that those applications care about. I don't see
>> anyone making the case that specifically what the applications are
>> trying to do is buggy.
Actually, I do. The assumptions made by those applications are most
certainly wrong - and have been wrong all along.

The problem is that just looking at these numbers is only helpful if the
application is pretty much alone on the machine. You'll find such
applications where the vendor explicitly tells you to run on a standalone
machine, disable swap, etc. Database systems are probably the most
prominent sector here.

Under certain circumstances this kind of auto tuning (e.g. automatically
eating X% of RAM for buffers) might, heuristically, work. Under certain
circumstances.

Those assumptions have already been defeated by VMs (e.g. overcommit).

I don't feel it's a good idea to add extra kernel code just in order to
make some ill-designed applications work a little less badly (they still
need to be changed anyway).
Don't get me wrong, I'm really in favour of having a clean interface for
telling applications how many resources they can take (I have actually
considered quite the same thing), but this needs to be very well thought
out (and we should also get the orchestration folks into the loop for
that). This basically falls into two categories:

a) hard limits - how much can an application possibly consume
   --> what counts as an application? process? process group?
       cgroup? namespace?

b) reasonable defaults - how much can an application take at will, w/o
   affecting others?
   --> what exactly is "reasonable"? kernel cache vs. userland buffers?
   --> how to deal w/ overcommit scenarios?
   --> who shall be in charge of controlling these values?

It's a very complex problem, not easy to solve. Much of it seems to be a
matter of policy, and depends on the actual workloads.

Maybe, for now, it's better to pursue this at the orchestration level.
> Not all the information at the system level translates well to the
> container level. Things like available memory require a hierarchical
> assessment rather than just a look at the local level, since there
> could be limits higher up the tree.
By the way: what exactly *is* a container anyways?

The mainline kernel (in contrast to the openvz kernel) doesn't actually
know about containers at all - instead it provides several concepts like
namespaces, cgroups, etc., that together are used for providing some
container environment - but that composition is done in userland, and
there are several approaches w/ different semantics.

Before we can do anything container-specific in the kernel, we first have
to come to a general agreement on what a container actually is from the
kernel's perspective. No idea whether we can achieve that at all (in the
near future) w/o actually introducing the concept of a container within
the kernel.
> We should also not speculate what users intended to do with the
> meminfo data right now. There is a surprising amount of misconception
> around what these values actually mean. I'd rather have users show up
> on the mailing list directly and outline the broader usecase.
ACK.
The only practical use cases I'm aware of are:
a) safety: know how much memory I can eat until I get -ENOMEM, so
applications can proactively take counter measures, eg. pre-allocation
(common practice in safety-related applications)
b) autotuning: how much shall the application take for caches or
buffers. This is problematic, since it can only work on heuristics,
which in turn can only be found experimentally within a certain range
of assumptions (eg. certain databases like to do that). That way
you can only find more or less reasonable parameters for the majority
of cases (assuming you have an idea what that majority actually is),
but still far from optimal. For *good* parameters you need to measure
your actual workloads and apply good knowledge of what the
application is actually doing (one of a DBA's primary jobs).
--mtx
--
---
Note: unencrypted e-mails can easily be intercepted and manipulated!
For confidential communication, please send your GPG/PGP key.
---
Enrico Weigelt, metux IT consult
Free software and Linux embedded engineering
[email protected] -- +49-151-27565287
On Thu, Jun 03, 2021 at 12:43:07PM +0200, [email protected] wrote:
> From: Alexey Gladkov <[email protected]>
>
> The /proc/meminfo contains information regardless of the cgroups
> restrictions. This file is still widely used [1]. This means that all
> these programs will not work correctly inside container [2][3][4]. Some
> programs try to respect the cgroups limits, but not all of them
> implement support for all cgroup versions [5].
>
> Correct information can be obtained from cgroups, but this requires the
> cgroups to be available inside container and the correct version of
> cgroups to be supported.
>
> There is lxcfs [6] that emulates /proc/meminfo using fuse to provide
> information regarding cgroups. This patch can help them.
>
> This patch adds /proc/self/meminfo that contains a subset of
> /proc/meminfo respecting cgroup restrictions.
>
> We cannot just create /proc/self/meminfo and make a symlink at the old
> location because this will break the existing apparmor rules [7].
> Therefore, the patch adds a separate file with the same format.
Interesting work. Thanks. This is basically a variant of what I
suggested at Plumbers and in [1].
Judging from the patches sent by Waiman Long in [2] to also virtualize
/proc/cpuinfo and /sys/devices/system/cpu this is a larger push to
provide virtualized system information to containers.
Although somewhere in the thread here this veered off into apparently
just being a way for a process to gather information about it's own
resources. At which point I'm confused why looking at its cgroups
isn't enough.
So /proc/self/meminfo seems to just be the start. And note the two
approaches seem to diverge too. This provides a new file while the other
patchset virtualizes existing proc files/folders.
In any case it seems you might want to talk since afaict you're all at
the same company but don't seem to be aware of each other's work (which
happens, of course).
For the sake of history such patchsets have been pushed for before by
the Siteground people.
Chris and Johannes made a good point that the information provided in
this file can be gathered from cgroups already. So applications should
probably switch to reading those out of their cgroup and most are doing
that already.
And reading values out of cgroups is pretty straightforward even with
the differences between cgroup v1 and v2. Userspace is doing it all over
the place all of the time and the code has now existed for years, so the
cgroup interface is hardly a problem. And cgroup v2 keeps growing so
many more useful metrics that looking at meminfo isn't really cutting it
anyway.
So I think the argument that applications should start looking at their
cgroup info if they want to find out detailed info is a solid argument
that shouldn't be easily brushed aside.
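To make "reading values out of their cgroup" concrete, here is a minimal
userspace sketch (illustration only; it assumes the conventional mounts
/sys/fs/cgroup for v2 and /sys/fs/cgroup/memory for v1, and it only looks
at the local limit rather than walking the ancestors):

/* Illustration only: read this process's memory limit from cgroup v2,
 * falling back to the cgroup v1 root value. A real implementation would
 * also parse the v1 path from /proc/self/cgroup and walk the ancestors. */
#include <stdio.h>
#include <string.h>

/* Returns the limit in bytes, or -1 for "max" / not readable. */
static long long read_limit(const char *path)
{
        char buf[64];
        long long val = -1;
        FILE *f = fopen(path, "r");

        if (!f)
                return -1;
        if (fgets(buf, sizeof(buf), f) && strncmp(buf, "max", 3) != 0)
                sscanf(buf, "%lld", &val);
        fclose(f);
        return val;
}

int main(void)
{
        char rel[4096] = "", path[4352], line[4352];
        long long limit;
        FILE *f = fopen("/proc/self/cgroup", "r");

        /* The "0::/foo/bar" line is the unified (v2) hierarchy entry. */
        while (f && fgets(line, sizeof(line), f))
                if (sscanf(line, "0::%4095s", rel) == 1)
                        break;
        if (f)
                fclose(f);

        snprintf(path, sizeof(path), "/sys/fs/cgroup%s/memory.max", rel);
        limit = read_limit(path);
        if (limit < 0)  /* fall back to cgroup v1 */
                limit = read_limit("/sys/fs/cgroup/memory/memory.limit_in_bytes");

        if (limit < 0)
                printf("no memory limit configured\n");
        else
                printf("memory limit: %lld bytes\n", limit);
        return 0;
}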
What might be worthwhile is knowing exactly which applications are looking
at /proc/meminfo and /proc/cpuinfo and making decisions based on that info.
None of that is clearly outlined in the thread, unfortunately.
So I immediately see two types of applications that could benefit from
this patchset. The first ones are legacy applications that aren't aware
of cgroups and aren't actively maintained. Introducing such
functionality for these applications seems a weak argument.
The second type is new and maintained applications that look at global
info such as /proc/meminfo and /proc/cpuinfo. So such applications have
ignored cgroups for a decade now. This makes it very unconvincing that
they will suddenly switch to a newly introduced file. Especially if the
entries in a new file aren't a 1:1 mapping of the old file.
Johannes made another good point about it not being clear what
applications actually want. And he's very right in that. It seems
straightforward to virtualize things like meminfo but it actually isn't.
And it's something you quite often discover after the fact. We have
extensive experience implementing it in LXCFS in userspace. People kept
and keep arguing what information exactly is supposed to go into
calculating those values based on what best helps their use-case.
Swap was an especially contentious point. In fact, sometimes users want
to turn off swap even though it exists on the host, and there's a command
line switch in LXCFS to control that behavior.
Another example supporting Johannes' worry is virtualizing /proc/cpuinfo
where some people wanted to virtualize cpu counts based on cpu shares.
So we have two modes to virtualize cpus: based on cpuset alone or based
on cpuset and cpu shares. And both modes are actively used. And that all
really depends on application and workload.
Finally, LXCFS is briefly referenced in the commit message, but neither it
nor what it does is explained very well.
And we should consider it, since it is a full, existing userspace
solution to the problem solved by this patchset, including Dan's JRE
use-case.
It is a project started in 2014, it has been in production use ever since,
and it delivers the features of this patchset here and more.
For example, it's used in the Linux subsystem of Chromebooks, it's used
by Alibaba (see [3]) and it is used for the JRE use-case by Google's
Anthos when migrating such legacy applications (see [4]).
At first, I was convinced we could make use of /proc/self/meminfo in
LXCFS, which is why I held back, but we can't. We can't simply bind-mount
it over /proc/meminfo because there isn't a 1:1 correspondence between all
fields. We could potentially read some values we now calculate and
display them in /proc/meminfo, but we can't stop virtualizing /proc/meminfo
itself. So we don't gain anything from this. When Alex asked me about it
I tried to come up with good ways to integrate this, but the gain is just
too little for us.
Because our experience tells us that applications that want this type of
virtualization don't really care about their own resources. They care
about a virtualized view of the system's resources. And the system in
question is often a container. But it gets very tricky since we don't
really define what a container is. So what data the user wants to see
depends on the container runtime used, the type of container, and the
workload. An application container has very different needs than a system
container that boots systemd. LXCFS can be very flexible here and
virtualize according to the user's preferences (see the split between
cpuset and cpuset + cpu shares virtualization for cpu counts).
In any case, LXCFS is a tiny FUSE filesystem which virtualizes various
procfs and sysfs files for a container:
/proc/cpuinfo
/proc/diskstats
/proc/meminfo
/proc/stat
/proc/swaps
/proc/uptime
/proc/slabinfo
/sys/devices/system/cpu/*
/sys/devices/system/cpu/online
If you call top in a container that makes use of this it will display
everything virtualized to the container (See [5] for an example of
/proc/cpuinfo and /sys/devices/system/cpu/*.). And JRE will not
overallocate resources. It's actively used for all of that.
Below at [5] you can find an example where 2 cpus out of 8 have been
assigned to the container's cpuset. The container values are virtualized
as you can see.
[1]: https://lkml.org/lkml/2020/6/4/951
[2]: https://lore.kernel.org/lkml/YMe/[email protected]
[3]: https://www.alibabacloud.com/blog/kubernetes-demystified-using-lxcfs-to-improve-container-resource-visibility_594109
[4]: https://cloud.google.com/blog/products/containers-kubernetes/migrate-for-anthos-streamlines-legacy-java-app-modernization
[5]: ## /sys/devices/system/cpu/*
#### Host
brauner@wittgenstein|~
> ls -al /sys/devices/system/cpu/ | grep cpu[[:digit:]]
drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu0
drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu1
drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu2
drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu3
drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu4
drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu5
drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu6
drwxr-xr-x 10 root root 0 Jun 14 21:22 cpu7
#### Container
brauner@wittgenstein|~
> lxc exec f1 -- ls -al /sys/devices/system/cpu/ | grep cpu[[:digit:]]
drwxr-xr-x 2 nobody nogroup 0 Jun 15 10:22 cpu3
drwxr-xr-x 2 nobody nogroup 0 Jun 15 10:22 cpu4
## /proc/cpuinfo
#### Host
brauner@wittgenstein|~
> grep ^processor /proc/cpuinfo
processor : 0
processor : 1
processor : 2
processor : 3
processor : 4
processor : 5
processor : 6
processor : 7
#### Container
brauner@wittgenstein|~
> lxc exec f1 -- grep ^processor /proc/cpuinfo
processor : 0
processor : 1
## top
#### Host
top - 13:16:47 up 15:54, 39 users, load average: 0,76, 0,47, 0,40
Tasks: 434 total, 1 running, 433 sleeping, 0 stopped, 0 zombie
%Cpu0 : 2,7 us, 2,4 sy, 0,0 ni, 94,5 id, 0,0 wa, 0,0 hi, 0,3 si, 0,0 st
%Cpu1 : 3,3 us, 1,3 sy, 0,0 ni, 95,3 id, 0,0 wa, 0,0 hi, 0,0 si, 0,0 st
%Cpu2 : 1,6 us, 9,1 sy, 0,0 ni, 89,3 id, 0,0 wa, 0,0 hi, 0,0 si, 0,0 st
%Cpu3 : 2,3 us, 1,3 sy, 0,0 ni, 96,4 id, 0,0 wa, 0,0 hi, 0,0 si, 0,0 st
%Cpu4 : 2,7 us, 1,7 sy, 0,0 ni, 95,7 id, 0,0 wa, 0,0 hi, 0,0 si, 0,0 st
%Cpu5 : 2,9 us, 2,9 sy, 0,0 ni, 94,1 id, 0,0 wa, 0,0 hi, 0,0 si, 0,0 st
%Cpu6 : 2,3 us, 1,0 sy, 0,0 ni, 96,3 id, 0,0 wa, 0,0 hi, 0,3 si, 0,0 st
%Cpu7 : 3,3 us, 1,3 sy, 0,0 ni, 95,4 id, 0,0 wa, 0,0 hi, 0,0 si, 0,0 st
#### Container
top - 11:16:13 up 2:08, 0 users, load average: 0.27, 0.36, 0.36
Tasks: 24 total, 1 running, 23 sleeping, 0 stopped, 0 zombie
%Cpu0 : 0.0 us, 0.0 sy, 0.0 ni,100.0 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
%Cpu1 : 0.0 us, 0.0 sy, 0.0 ni,100.0 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
On Tue, Jun 15, 2021 at 01:32:22PM +0200, Christian Brauner wrote:
> On Thu, Jun 03, 2021 at 12:43:07PM +0200, [email protected] wrote:
> > From: Alexey Gladkov <[email protected]>
> >
> > [...]
>
> Interesting work. Thanks. This is basically a variant of what I
> suggested at Plumbers and in [1].
I made the second version of the patch [1], but then I had a conversation
with Eric W. Biederman offlist. He convinced me that it is a bad idea to
change all the values in meminfo to accommodate cgroups. But we agreed
that MemAvailable in /proc/meminfo should respect cgroups limits. This
field was created to hide implementation details when calculating
available memory. You can see that it is quite widely used [2].
So I want to try to move in that direction.
[1] https://git.kernel.org/pub/scm/linux/kernel/git/legion/linux.git/log/?h=patchset/meminfo/v2.0
[2] https://codesearch.debian.net/search?q=MemAvailable%3A
> Judging from the patches sent by Waiman Long in [2] to also virtualize
> /proc/cpuinfo and /sys/devices/system/cpu this is a larger push to
> provide virtualized system information to containers.
>
> Although somewhere in the thread here this veered off into apparently
> just being a way for a process to gather information about it's own
> resources. At which point I'm confused why looking at its cgroups
> isn't enough.
I think it's not enough. As an example:
$ mount -t cgroup2 none /sys/fs/cgroup
$ echo +memory > /sys/fs/cgroup/cgroup.subtree_control
$ mkdir /sys/fs/cgroup/mem0
$ echo +memory > /sys/fs/cgroup/mem0/cgroup.subtree_control
$ mkdir /sys/fs/cgroup/mem0/mem1
$ echo $$ > /sys/fs/cgroup/mem0/mem1/cgroup.procs
I didn't set a limit and just added the shell to the group.
$ cat /proc/self/cgroup
0::/mem0/mem1
$ cat /sys/fs/cgroup/mem0/mem1/memory.max
max
$ cat /sys/fs/cgroup/mem0/memory.max
max
In this case we need to use MemAvailable from /proc/meminfo.
Another example:
$ mount -t cgroup2 none /sys/fs/cgroup
$ echo +memory > /sys/fs/cgroup/cgroup.subtree_control
$ mkdir /sys/fs/cgroup/mem0
$ echo $(( 3 * 1024 * 1024 )) > /sys/fs/cgroup/mem0/memory.max
$ echo +memory > /sys/fs/cgroup/mem0/cgroup.subtree_control
$ mkdir /sys/fs/cgroup/mem0/mem1
$ echo $(( 3 * 1024 * 1024 * 1024 * 1024 )) > /sys/fs/cgroup/mem0/mem1/memory.max
$ echo $$ > /sys/fs/cgroup/mem0/mem1/cgroup.procs
$ head -3 /proc/meminfo
MemTotal: 1002348 kB
MemFree: 972712 kB
MemAvailable: 968100 kB
$ cat /sys/fs/cgroup/mem0{,/mem1}/memory.max
3145728
3298534883328
Now, I have cgroup limits, but you can write absolutely any value as a
limit. So how much memory is available to the shell in this case? To get
this value, you need to take the minimum of MemAvailable and **/memory.max.
... or I fundamentally don't understand something.
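Roughly, such a calculation could be sketched in userspace like this
(illustration only; it assumes cgroup v2 mounted at /sys/fs/cgroup and
ignores memory.current and memory.low/min protection):

/* Illustration only: effective available memory = min(MemAvailable,
 * smallest memory.max over the process's cgroup and all its ancestors). */
#include <stdio.h>
#include <string.h>

static long long meminfo_available_bytes(void)
{
        char line[256];
        long long kb = -1;
        FILE *f = fopen("/proc/meminfo", "r");

        while (f && fgets(line, sizeof(line), f))
                if (sscanf(line, "MemAvailable: %lld kB", &kb) == 1)
                        break;
        if (f)
                fclose(f);
        return kb < 0 ? -1 : kb * 1024;
}

/* Returns memory.max in bytes, or -1 for "max" / no such file (root). */
static long long cgroup_max_bytes(const char *cgpath)
{
        char path[4352], buf[64];
        long long val = -1;
        FILE *f;

        snprintf(path, sizeof(path), "/sys/fs/cgroup%s/memory.max", cgpath);
        f = fopen(path, "r");
        if (f && fgets(buf, sizeof(buf), f) && strncmp(buf, "max", 3) != 0)
                sscanf(buf, "%lld", &val);
        if (f)
                fclose(f);
        return val;
}

int main(void)
{
        char line[4352], cg[4096] = "";
        long long avail = meminfo_available_bytes();
        char *end;
        FILE *f = fopen("/proc/self/cgroup", "r");

        while (f && fgets(line, sizeof(line), f))
                if (sscanf(line, "0::%4095s", cg) == 1)
                        break;
        if (f)
                fclose(f);

        /* Walk from the process's cgroup up towards the root and clamp
         * the estimate by the smallest configured memory.max. */
        for (end = cg + strlen(cg); end > cg; ) {
                long long max = cgroup_max_bytes(cg);

                if (max >= 0 && (avail < 0 || max < avail))
                        avail = max;
                while (end > cg && *--end != '/')
                        ;
                *end = '\0';    /* truncate to the parent cgroup path */
        }

        printf("effective available: %lld bytes\n", avail);
        return 0;
}

With the second example above this would print the mem0 limit (3145728
bytes) rather than the host's MemAvailable.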
> [...]
--
Rgrds, legion
On Tue, Jun 15, 2021 at 5:47 AM Alexey Gladkov <[email protected]> wrote:
>
[...]
>
> I made the second version of the patch [1], but then I had a conversation
> with Eric W. Biederman offlist. He convinced me that it is a bad idea to
> change all the values in meminfo to accommodate cgroups. But we agreed
> that MemAvailable in /proc/meminfo should respect cgroups limits. This
> field was created to hide implementation details when calculating
> available memory. You can see that it is quite widely used [2].
> So I want to try to move in that direction.
>
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/legion/linux.git/log/?h=patchset/meminfo/v2.0
> [2] https://codesearch.debian.net/search?q=MemAvailable%3A
>
Please see the following two links on the previous discussion on having
a per-memcg MemAvailable stat.
[1] https://lore.kernel.org/linux-mm/alpine.DEB.2.22.394.2006281445210.855265@chino.kir.corp.google.com/
[2] https://lore.kernel.org/linux-mm/alpine.DEB.2.23.453.2007142018150.2667860@chino.kir.corp.google.com/
MemAvailable itself is an imprecise metric and involving memcg makes
this metric even more weird. The difference of semantics of swap
accounting of v1 and v2 is one source of this weirdness (I have not
checked whether your patch handles this weirdness). The lazyfree and
deferred split pages are another source.
So, I am not sure if complicating an already imprecise metric will
make it more useful.
Shakeel Butt <[email protected]> writes:
> On Tue, Jun 15, 2021 at 5:47 AM Alexey Gladkov <[email protected]> wrote:
>>
> [...]
>>
>> I made the second version of the patch [1], but then I had a conversation
>> with Eric W. Biederman offlist. He convinced me that it is a bad idea to
>> change all the values in meminfo to accommodate cgroups. But we agreed
>> that MemAvailable in /proc/meminfo should respect cgroups limits. This
>> field was created to hide implementation details when calculating
>> available memory. You can see that it is quite widely used [2].
>> So I want to try to move in that direction.
>>
>> [1] https://git.kernel.org/pub/scm/linux/kernel/git/legion/linux.git/log/?h=patchset/meminfo/v2.0
>> [2] https://codesearch.debian.net/search?q=MemAvailable%3A
>>
>
> Please see the following two links on the previous discussion on having
> a per-memcg MemAvailable stat.
>
> [1] https://lore.kernel.org/linux-mm/alpine.DEB.2.22.394.2006281445210.855265@chino.kir.corp.google.com/
> [2] https://lore.kernel.org/linux-mm/alpine.DEB.2.23.453.2007142018150.2667860@chino.kir.corp.google.com/
>
> MemAvailable itself is an imprecise metric and involving memcg makes
> this metric even more weird. The difference of semantics of swap
> accounting of v1 and v2 is one source of this weirdness (I have not
> checked whether your patch handles this weirdness). The lazyfree and
> deferred split pages are another source.
>
> So, I am not sure if complicating an already imprecise metric will
> make it more useful.
Making a good guess at how much memory can be allocated without
triggering swapping or otherwise stressing the system is something that
requires understanding our mm internals.
To be able to continue changing the mm or even mm policy without
introducing regressions in userspace we need to export values that
userspace can use.
At a first approximation that seems to look like MemAvailable.
MemAvailable seems to have a good definition. Roughly the amount of
memory that can be allocated without triggering swapping. Updated
to also not trigger memory-cgroup-based swapping, it sounds good.
I don't know if it will work in practice but I think it is worth
exploring.
I do know that hiding the implementation details and providing userspace
with information it can directly use seems like the programming model
that needs to be explored. Most programs should not care if they are in
a memory cgroup, etc. Programs, load management systems, and even
balloon drivers have a legitimate interest in how much additional load
can be placed on a system's memory.
A version of this that I remember working fairly well is free space
on compressed filesystems. As I recall compressed filesystems report
the amount of uncompressed space that is available (an underestimate).
This results in the amount of space consumed going up faster than the
free space goes down.
We can't do exactly the same thing with our memory usability estimate,
but having our estimate be a reliable underestimate might be enough
to avoid problems with reporting too much memory as available to
userspace.
I know that MemAvailable already does that /2 so maybe it is already
aiming at being an underestimate. Perhaps we need some additional
accounting to help create a useful metric for userspace as well.
I don't know the final answer. I do know that not designing an
interface that userspace can use to deal with its legitimate concerns
is sticking our collective heads in the sand and wishing the problem
will go away.
Eric
On Wed 16-06-21 11:17:38, Eric W. Biederman wrote:
[...]
> MemAvailable seems to have a good definition. Roughly the amount of
> memory that can be allocated without triggering swapping. Updated
> to also not trigger memory-cgroup-based swapping, it sounds good.
yes this definition is at least understandable but how do you want to
define it in the memcg scope? There are two different sources of memory
pressure when dealing with memcgs: an internal one when a limit is hit,
and an external one when the source of the reclaim comes from higher up
the hierarchy (including the global memory pressure). The former would
be quite easy to mimic with the global semantic, but the latter gets
much more complex very quickly - a) you would need a snapshot of the
whole cgroup tree and evaluate it against the global memory state, b) you
would have to consider memory reclaim protection, c) the external memory
pressure is distributed proportionally to the size most of the time, which
is yet another complication. And there are other challenges that have
already been discussed.
That being said, this might be possible to implement, but I am not really
sure it is viable, and I strongly suspect that it will get unreliable
in many situations in the context of "how much you can allocate without
swapping".
--
Michal Hocko
SUSE Labs
On Wed, Jun 16, 2021 at 9:17 AM Eric W. Biederman <[email protected]> wrote:
>
> Shakeel Butt <[email protected]> writes:
>
> > On Tue, Jun 15, 2021 at 5:47 AM Alexey Gladkov <[email protected]> wrote:
> >>
> > [...]
> >>
> >> I made the second version of the patch [1], but then I had a conversation
> >> with Eric W. Biederman offlist. He convinced me that it is a bad idea to
> >> change all the values in meminfo to accommodate cgroups. But we agreed
> >> that MemAvailable in /proc/meminfo should respect cgroups limits. This
> >> field was created to hide implementation details when calculating
> >> available memory. You can see that it is quite widely used [2].
> >> So I want to try to move in that direction.
> >>
> >> [1] https://git.kernel.org/pub/scm/linux/kernel/git/legion/linux.git/log/?h=patchset/meminfo/v2.0
> >> [2] https://codesearch.debian.net/search?q=MemAvailable%3A
> >>
> >
> > Please see following two links on the previous discussion on having
> > per-memcg MemAvailable stat.
> >
> > [1] https://lore.kernel.org/linux-mm/alpine.DEB.2.22.394.2006281445210.855265@chino.kir.corp.google.com/
> > [2] https://lore.kernel.org/linux-mm/alpine.DEB.2.23.453.2007142018150.2667860@chino.kir.corp.google.com/
> >
> > MemAvailable itself is an imprecise metric and involving memcg makes
> > this metric even more weird. The difference of semantics of swap
> > accounting of v1 and v2 is one source of this weirdness (I have not
> > checked whether your patch handles this weirdness). The lazyfree and
> > deferred split pages are another source.
> >
> > So, I am not sure if complicating an already imprecise metric will
> > make it more useful.
>
> Making a good guess at how much memory can be allocated without
> triggering swapping or otherwise stressing the system is something that
> requires understanding our mm internals.
>
> To be able to continue changing the mm or even mm policy without
> introducing regressions in userspace we need to export values that
> userspace can use.
The issue is the dependence of such exported values on mm internals.
MM internal code and policy changes will change this value, and there
is a potential for userspace regressions.
>
> At a first approximation that seems to look like MemAvailable.
>
> MemAvailable seems to have a good definition. Roughly the amount of
> memory that can be allocated without triggering swapping.
Nowadays, I don't think MemAvailable giving "amount of memory that can
be allocated without triggering swapping" is even roughly accurate.
Actually IMO "without triggering swap" is not something an application
should concern itself with, given that refaults from some swap types
(zswap/swap-on-zram) are much faster than refaults from disk.
> Updated
> to also not trigger memory-cgroup-based swapping, it sounds good.
>
> I don't know if it will work in practice but I think it is worth
> exploring.
I agree.
>
> I do know that hiding the implementation details and providing userspace
> with information it can directly use seems like the programming model
> that needs to be explored. Most programs should not care if they are in
> a memory cgroup, etc. Programs, load management systems, and even
> balloon drivers have a legitimate interest in how much additional load
> can be placed on a system's memory.
>
How much additional load can be placed on a system *until what*. I
think we should focus more on the "until" part to make the problem
more tractable.
>
> A version of this that I remember working fairly well is free space
> on compressed filesystems. As I recall compressed filesystems report
> the amount of uncompressed space that is available (an underestimate).
> This results in the amount of space consumed going up faster than the
> free space goes down.
>
> We can't do exactly the same thing with our memory usability estimate,
> but having our estimate be a reliable underestimate might be enough
> to avoid problems with reporting too much memory as available to
> userspace.
>
> I know that MemAvailable already does that /2 so maybe it is already
> aiming at being an underestimate. Perhaps we need some additional
> accounting to help create a useful metric for userspace as well.
>
The real challenge here is that we are not 100% sure if a page is
reclaimable until we try to reclaim it. For example we might have file
LRUs filled with lazyfree pages which might have been accessed.
MemAvailable will show half the size of the file LRUs, but once we try
to reclaim those pages, we have to move them back to the anon LRU, and
MemAvailable drops drastically.
>
> I don't know the final answer. I do know that not designing an
> interface that userspace can use to deal with its legitimate concerns
> is sticking our collective heads in the sand and wishing the problem
> will go away.
I am a bit skeptical that a single interface would be enough, but first
we should formalize what exactly the application wants, with some
concrete use-cases. More specifically, are the applications interested
in avoiding swapping, OOM, or stalls?
Second, is the reactive approach acceptable? Instead of an upfront
number representing the room for growth, how about just growing and
backing off when some event (OOM or stall) that we want to avoid is about
to happen? This is achievable today for OOM and stalls with PSI and
memory.high, and it avoids the hard problem of reliably estimating the
reclaimable memory.
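For example, a minimal sketch of such a grow-and-backoff loop could look
like this (illustration only; the PSI trigger usage follows
Documentation/accounting/psi.rst, while the cgroup path and the 150ms/1s
threshold are made-up values, and the actual backoff action is left as a
comment):

/* Illustration only: back off on a PSI trigger registered on the
 * cgroup's memory.pressure file instead of predicting available
 * memory up front. */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char *path = "/sys/fs/cgroup/mem0/mem1/memory.pressure";
        /* Wake us when tasks were stalled on memory for a total of
         * >= 150ms within any 1s window. */
        const char *trig = "some 150000 1000000";
        struct pollfd pfd;

        pfd.fd = open(path, O_RDWR | O_NONBLOCK);
        if (pfd.fd < 0 || write(pfd.fd, trig, strlen(trig) + 1) < 0) {
                perror("psi trigger");
                return 1;
        }
        pfd.events = POLLPRI;

        for (;;) {
                if (poll(&pfd, 1, -1) < 0)
                        break;
                if (pfd.revents & POLLERR)
                        break;  /* e.g. the cgroup went away */
                if (pfd.revents & POLLPRI) {
                        /* Memory pressure: back off here, e.g. shrink the
                         * application's own caches so usage drops back
                         * under memory.high. */
                        fprintf(stderr, "memory pressure event\n");
                }
        }
        close(pfd.fd);
        return 0;
}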
On 19.06.21 01:38, Shakeel Butt wrote:
> Nowadays, I don't think MemAvailable giving "amount of memory that can
> be allocated without triggering swapping" is even roughly accurate.
> Actually IMO "without triggering swap" is not something an application
> should concern itself with, given that refaults from some swap types
> (zswap/swap-on-zram) are much faster than refaults from disk.
If we're talking about things like database workloads, there IMHO isn't
anything really better than doing measurements with the actual loads
and tuning incrementally.
But: what is the actual optimization goal - why might an application
want to know where swapping begins ? Computing performance ? Caching +
IO Latency or throughput ? Network traffic (e.g. w/ iscsi) ? Power
consumption ?
>> I do know that hiding the implementation details and providing userspace
>> with information it can directly use seems like the programming model
>> that needs to be explored. Most programs should not care if they are in
>> a memory cgroup, etc. Programs, load management systems, and even
>> balloon drivers have a legitimate interest in how much additional load
>> can be placed on a system's memory.
What kind of load exactly ? CPU ? disk IO ? network ?
> How much additional load can be placed on a system *until what*. I
> think we should focus more on the "until" part to make the problem
> more tractable.
ACK. The interesting question is what to do in that case.
An obvious move by a database system could be eg. filling only as much
cache as there is spare physical RAM, in order to avoid useless swapping
(since we'd potentially produce more IO load when a cache is written
out to swap, instead of just discarding it).
But, this also depends ...
#1: the application doesn't know the actual performance of the swap
device, eg. the already mentioned zswap+friends, or some fast nvmem
for swap vs disk for storage.
#2: caches might also be implemented indirectly by mmap()ing the storage
file/device and so using the kernel's cache here. In that case, the
kernel would automatically discard the pages w/o going to swap. Of
course that only works if the cache is nothing but a copy of pages from
storage in RAM.
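A tiny sketch of #2 (illustration only, the file path is made up): mapping
the data read-only keeps the cache as clean, file-backed page cache that
the kernel can simply drop under pressure instead of writing anything to
swap.

/* Illustration only: a read cache backed by the page cache via mmap(). */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        const char *file = "/var/lib/app/data.bin";
        struct stat st;
        char *cache;
        int fd = open(file, O_RDONLY);

        if (fd < 0 || fstat(fd, &st) < 0) {
                perror(file);
                return 1;
        }

        cache = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
        if (cache == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        /* Hint that access is random; the kernel keeps hot pages cached
         * and reclaims cold ones as needed. */
        madvise(cache, st.st_size, MADV_RANDOM);

        /* ... use cache[offset] as the application's read cache ... */

        munmap(cache, st.st_size);
        close(fd);
        return 0;
}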
A completely different scenario would be load management on a cluster
like k8s. Here we usually care about cluster performance (we don't care
about individual nodes so much), but want to prevent individual nodes
from being overloaded. Since we usually don't know much about the
individual workload, we probably have little choice other than continuous
monitoring and acting when a node is getting too busy - or trying to
balance when new workloads are started, based on current system load (and
other metrics). In that case, I don't see how this new proc file would be
of much help.
> Second, is the reactive approach acceptable? Instead of an upfront
> number representing the room for growth, how about just growing and
> backing off when some event (OOM or stall) that we want to avoid is about
> to happen? This is achievable today for OOM and stalls with PSI and
> memory.high, and it avoids the hard problem of reliably estimating the
> reclaimable memory.
I tend to believe that for certain use cases it would be helpful if an
application got notified when some of its pages are about to be swapped
out due to memory pressure. Then it could decide on its own whether
it should drop certain caches in order to prevent swapping.
--mtx
--
---
Note: unencrypted e-mails can easily be intercepted and manipulated!
For confidential communication, please send your GPG/PGP key.
---
Enrico Weigelt, metux IT consult
Free software and Linux embedded engineering
[email protected] -- +49-151-27565287