This patchset does a few things to aid in OOM debugging, in particular
when shrinkers are involved:
- improve the show_mem OOM report: it now reports on shrinkers, and
  for both shrinkers and slab we only report the top 10 entries,
  sorted, not the full list
- add shrinker_to_text(), for the show_mem report and debugfs, and an
  optional shrinker.to_text() callback to report extra
  driver-specific information
- add extra counters for the shrinker_to_text() report
- implement shrinker.to_text() for bcachefs, giving an example of how
  to use the callbacks (a minimal sketch of the new hook follows below)
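For drivers, wiring up the new hook looks roughly like this (a minimal
sketch against the new shrinker_alloc() API; the foo_* names and the
nr_dirty field are made up for illustration - the bcachefs patch later
in the series has a real implementation):

  static void foo_shrinker_to_text(struct seq_buf *out, struct shrinker *shrink)
  {
          struct foo_cache *cache = shrink->private_data;

          /* report whatever is relevant to reclaim, e.g. dirtiness: */
          seq_buf_printf(out, "dirty objects: %lu\n", cache->nr_dirty);
  }

  ...
  shrink = shrinker_alloc(0, "foo-cache");
  shrink->count_objects = foo_shrinker_count;
  shrink->scan_objects  = foo_shrinker_scan;
  shrink->to_text       = foo_shrinker_to_text;   /* new, optional */
  shrink->private_data  = cache;
  shrinker_register(shrink);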
Daniel Hill (1):
bcachefs: add counters for failed shrinker reclaim
Kent Overstreet (6):
seq_buf: seq_buf_human_readable_u64()
mm: shrinker: Add a .to_text() method for shrinkers
mm: shrinker: Add new stats for .to_text()
mm: Centralize & improve oom reporting in show_mem.c
mm: shrinker: Add shrinker_to_text() to debugfs interface
bcachefs: shrinker.to_text() methods
fs/bcachefs/btree_cache.c | 102 +++++++++++++++++++++++++++-------
fs/bcachefs/btree_cache.h | 2 +-
fs/bcachefs/btree_key_cache.c | 14 +++++
fs/bcachefs/btree_types.h | 10 ++++
fs/bcachefs/sysfs.c | 2 +-
include/linux/seq_buf.h | 4 ++
include/linux/shrinker.h | 12 +++-
lib/seq_buf.c | 10 ++++
mm/oom_kill.c | 23 --------
mm/show_mem.c | 20 +++++++
mm/shrinker.c | 89 ++++++++++++++++++++++++++++-
mm/shrinker_debug.c | 18 ++++++
mm/slab.h | 6 +-
mm/slab_common.c | 52 ++++++++++++++---
14 files changed, 305 insertions(+), 59 deletions(-)
--
2.42.0
This adds a seq_buf wrapper for string_get_size().
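For example, a caller might use it like this (a hypothetical snippet,
purely for illustration):

  struct seq_buf s;
  char buf[32];

  seq_buf_init(&s, buf, sizeof(buf));
  seq_buf_human_readable_u64(&s, 1 << 20, STRING_UNITS_2);
  /* the seq_buf now holds something like "1.00 MiB" */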
Signed-off-by: Kent Overstreet <[email protected]>
---
include/linux/seq_buf.h | 4 ++++
lib/seq_buf.c | 10 ++++++++++
2 files changed, 14 insertions(+)
diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h
index 5fb1f12c33f9..dfcd0f367d6a 100644
--- a/include/linux/seq_buf.h
+++ b/include/linux/seq_buf.h
@@ -171,4 +171,8 @@ seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary);
void seq_buf_do_printk(struct seq_buf *s, const char *lvl);
+enum string_size_units;
+void seq_buf_human_readable_u64(struct seq_buf *s, u64 v,
+ const enum string_size_units units);
+
#endif /* _LINUX_SEQ_BUF_H */
diff --git a/lib/seq_buf.c b/lib/seq_buf.c
index 010c730ca7fc..9d4e4d5f43b4 100644
--- a/lib/seq_buf.c
+++ b/lib/seq_buf.c
@@ -425,3 +425,13 @@ int seq_buf_hex_dump(struct seq_buf *s, const char *prefix_str, int prefix_type,
}
return 0;
}
+
+void seq_buf_human_readable_u64(struct seq_buf *s, u64 v, const enum string_size_units units)
+{
+ char *buf;
+ size_t size = seq_buf_get_buf(s, &buf);
+ int wrote = string_get_size(v, 1, units, buf, size);
+
+ seq_buf_commit(s, wrote);
+}
+EXPORT_SYMBOL(seq_buf_human_readable_u64);
--
2.42.0
This adds a new callback method to shrinkers which they can use to
describe anything relevant to memory reclaim about their internal state,
for example object dirtiness.
This patch also adds shrinkers_to_text(), which reports on the top 10
shrinkers - by object count - in sorted order, to be used in OOM
reporting.
Cc: Andrew Morton <[email protected]>
Cc: Qi Zheng <[email protected]>
Cc: Roman Gushchin <[email protected]>
Cc: [email protected]
Signed-off-by: Kent Overstreet <[email protected]>
---
include/linux/shrinker.h | 6 +++-
mm/shrinker.c | 73 +++++++++++++++++++++++++++++++++++++++-
2 files changed, 77 insertions(+), 2 deletions(-)
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 1a00be90d93a..968c55474e78 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -24,6 +24,8 @@ struct shrinker_info {
struct shrinker_info_unit *unit[];
};
+struct seq_buf;
+
/*
* This struct is used to pass information from page reclaim to the shrinkers.
* We consolidate the values for easier extension later.
@@ -80,10 +82,12 @@ struct shrink_control {
* @flags determine the shrinker abilities, like numa awareness
*/
struct shrinker {
+ const char *name;
unsigned long (*count_objects)(struct shrinker *,
struct shrink_control *sc);
unsigned long (*scan_objects)(struct shrinker *,
struct shrink_control *sc);
+ void (*to_text)(struct seq_buf *, struct shrinker *);
long batch; /* reclaim batch size, 0 = default */
int seeks; /* seeks to recreate an obj */
@@ -110,7 +114,6 @@ struct shrinker {
#endif
#ifdef CONFIG_SHRINKER_DEBUG
int debugfs_id;
- const char *name;
struct dentry *debugfs_entry;
#endif
/* objs pending delete, per node */
@@ -135,6 +138,7 @@ __printf(2, 3)
struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...);
void shrinker_register(struct shrinker *shrinker);
void shrinker_free(struct shrinker *shrinker);
+void shrinkers_to_text(struct seq_buf *);
static inline bool shrinker_try_get(struct shrinker *shrinker)
{
diff --git a/mm/shrinker.c b/mm/shrinker.c
index dd91eab43ed3..4976dbac4c83 100644
--- a/mm/shrinker.c
+++ b/mm/shrinker.c
@@ -1,8 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/memcontrol.h>
+#include <linux/rculist.h>
#include <linux/rwsem.h>
+#include <linux/seq_buf.h>
#include <linux/shrinker.h>
-#include <linux/rculist.h>
#include <trace/events/vmscan.h>
#include "internal.h"
@@ -807,3 +808,73 @@ void shrinker_free(struct shrinker *shrinker)
call_rcu(&shrinker->rcu, shrinker_free_rcu_cb);
}
EXPORT_SYMBOL_GPL(shrinker_free);
+
+void shrinker_to_text(struct seq_buf *out, struct shrinker *shrinker)
+{
+ struct shrink_control sc = { .gfp_mask = GFP_KERNEL, };
+
+ seq_buf_puts(out, shrinker->name);
+ seq_buf_printf(out, " objects: %lu\n", shrinker->count_objects(shrinker, &sc));
+
+ if (shrinker->to_text) {
+ shrinker->to_text(out, shrinker);
+ seq_buf_puts(out, "\n");
+ }
+}
+
+/**
+ * shrinkers_to_text - Report on shrinkers with highest usage
+ *
+ * This reports on the top 10 shrinkers, by object counts, in sorted order:
+ * intended to be used for OOM reporting.
+ */
+void shrinkers_to_text(struct seq_buf *out)
+{
+ struct shrinker *shrinker;
+ struct shrinker_by_mem {
+ struct shrinker *shrinker;
+ unsigned long mem;
+ } shrinkers_by_mem[10];
+ int i, nr = 0;
+
+ if (!mutex_trylock(&shrinker_mutex)) {
+ seq_buf_puts(out, "(couldn't take shrinker lock)");
+ return;
+ }
+
+ list_for_each_entry(shrinker, &shrinker_list, list) {
+ struct shrink_control sc = { .gfp_mask = GFP_KERNEL, };
+ unsigned long mem = shrinker->count_objects(shrinker, &sc);
+
+ if (!mem || mem == SHRINK_STOP || mem == SHRINK_EMPTY)
+ continue;
+
+ for (i = 0; i < nr; i++)
+ if (mem < shrinkers_by_mem[i].mem)
+ break;
+
+ if (nr < ARRAY_SIZE(shrinkers_by_mem)) {
+ memmove(&shrinkers_by_mem[i + 1],
+ &shrinkers_by_mem[i],
+ sizeof(shrinkers_by_mem[0]) * (nr - i));
+ nr++;
+ } else if (i) {
+ i--;
+ memmove(&shrinkers_by_mem[0],
+ &shrinkers_by_mem[1],
+ sizeof(shrinkers_by_mem[0]) * i);
+ } else {
+ continue;
+ }
+
+ shrinkers_by_mem[i] = (struct shrinker_by_mem) {
+ .shrinker = shrinker,
+ .mem = mem,
+ };
+ }
+
+ for (i = nr - 1; i >= 0; --i)
+ shrinker_to_text(out, shrinkers_by_mem[i].shrinker);
+
+ mutex_unlock(&shrinker_mutex);
+}
--
2.42.0
Add a few new shrinker stats.
number of objects requested to free, number of objects freed:
  Shrinkers won't necessarily free all objects requested, for a variety
  of reasons, but if the two counts differ wildly something is likely
  amiss.
.scan_objects runtime:
  If one shrinker takes an excessive amount of time to free objects, it
  will block kswapd from running other shrinkers.
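For example, if a shrinker's .scan_objects calls have accumulated 10 ms
of runtime while freeing 1000 objects, the shrinker_to_text() report
will show roughly 10000 ns per object freed; a shrinker whose number is
orders of magnitude higher than its peers is a likely culprit.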
Cc: Andrew Morton <[email protected]>
Cc: Qi Zheng <[email protected]>
Cc: Roman Gushchin <[email protected]>
Cc: [email protected]
Signed-off-by: Kent Overstreet <[email protected]>
---
include/linux/shrinker.h | 5 +++++
mm/shrinker.c | 18 +++++++++++++++++-
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 968c55474e78..497a7e8e348c 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -118,6 +118,11 @@ struct shrinker {
#endif
/* objs pending delete, per node */
atomic_long_t *nr_deferred;
+
+ atomic_long_t objects_requested_to_free;
+ atomic_long_t objects_freed;
+ atomic_long_t last_freed;
+ atomic64_t ns_run;
};
#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
diff --git a/mm/shrinker.c b/mm/shrinker.c
index 4976dbac4c83..acfc3f92f552 100644
--- a/mm/shrinker.c
+++ b/mm/shrinker.c
@@ -430,13 +430,20 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
total_scan >= freeable) {
unsigned long ret;
unsigned long nr_to_scan = min(batch_size, total_scan);
+ u64 start_time = ktime_get_ns();
+
+ atomic_long_add(nr_to_scan, &shrinker->objects_requested_to_free);
shrinkctl->nr_to_scan = nr_to_scan;
shrinkctl->nr_scanned = nr_to_scan;
ret = shrinker->scan_objects(shrinker, shrinkctl);
+
+ atomic64_add(ktime_get_ns() - start_time, &shrinker->ns_run);
if (ret == SHRINK_STOP)
break;
freed += ret;
+ atomic_long_add(ret, &shrinker->objects_freed);
+ atomic_long_set(&shrinker->last_freed, ret);
count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
total_scan -= shrinkctl->nr_scanned;
@@ -812,9 +819,18 @@ EXPORT_SYMBOL_GPL(shrinker_free);
void shrinker_to_text(struct seq_buf *out, struct shrinker *shrinker)
{
struct shrink_control sc = { .gfp_mask = GFP_KERNEL, };
+ unsigned long nr_freed = atomic_long_read(&shrinker->objects_freed);
seq_buf_puts(out, shrinker->name);
- seq_buf_printf(out, " objects: %lu\n", shrinker->count_objects(shrinker, &sc));
+ seq_buf_putc(out, '\n');
+
+ seq_buf_printf(out, "objects: %lu", shrinker->count_objects(shrinker, &sc));
+ seq_buf_printf(out, "requested to free: %lu", atomic_long_read(&shrinker->objects_requested_to_free));
+ seq_buf_printf(out, "objects freed: %lu", nr_freed);
+ seq_buf_printf(out, "last freed: %lu", atomic_long_read(&shrinker->last_freed));
+ seq_buf_printf(out, "ns per object freed: %llu", nr_freed
+ ? div64_ul(atomic64_read(&shrinker->ns_run), nr_freed)
+ : 0);
if (shrinker->to_text) {
shrinker->to_text(out, shrinker);
--
2.42.0
This adds shrinker.to_text() methods for our shrinkers and hooks them up
to our existing to_text() functions.
Signed-off-by: Kent Overstreet <[email protected]>
---
fs/bcachefs/btree_cache.c | 13 +++++++++++++
fs/bcachefs/btree_key_cache.c | 14 ++++++++++++++
2 files changed, 27 insertions(+)
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 47e7770d0583..aab279dff944 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -13,6 +13,7 @@
#include <linux/prefetch.h>
#include <linux/sched/mm.h>
+#include <linux/seq_buf.h>
const char * const bch2_btree_node_flags[] = {
#define x(f) #f,
@@ -392,6 +393,17 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
return btree_cache_can_free(bc);
}
+static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
+{
+ struct bch_fs *c = shrink->private_data;
+ char *cbuf;
+ size_t buflen = seq_buf_get_buf(s, &cbuf);
+ struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
+
+ bch2_btree_cache_to_text(&out, c);
+ seq_buf_commit(s, out.pos);
+}
+
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
@@ -478,6 +490,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
bc->shrink = shrink;
shrink->count_objects = bch2_btree_cache_count;
shrink->scan_objects = bch2_btree_cache_scan;
+ shrink->to_text = bch2_btree_cache_shrinker_to_text;
shrink->seeks = 4;
shrink->private_data = c;
shrinker_register(shrink);
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 4402cf068c56..e14e9b4cd029 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -13,6 +13,7 @@
#include "trace.h"
#include <linux/sched/mm.h>
+#include <linux/seq_buf.h>
static inline bool btree_uses_pcpu_readers(enum btree_id id)
{
@@ -1028,6 +1029,18 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
INIT_LIST_HEAD(&c->freed_nonpcpu);
}
+static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
+{
+ struct bch_fs *c = shrink->private_data;
+ struct btree_key_cache *bc = &c->btree_key_cache;
+ char *cbuf;
+ size_t buflen = seq_buf_get_buf(s, &cbuf);
+ struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
+
+ bch2_btree_key_cache_to_text(&out, bc);
+ seq_buf_commit(s, out.pos);
+}
+
int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
@@ -1051,6 +1064,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
shrink->seeks = 0;
shrink->count_objects = bch2_btree_key_cache_count;
shrink->scan_objects = bch2_btree_key_cache_scan;
+ shrink->to_text = bch2_btree_key_cache_shrinker_to_text;
shrink->private_data = c;
shrinker_register(shrink);
return 0;
--
2.42.0
Previously, we added shrinker_to_text() and hooked it up to the OOM
report - now, the same report is available via debugfs.
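Assuming debugfs is mounted in the usual place, each shrinker's report
should then be readable from /sys/kernel/debug/shrinker/<name>/report.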
Signed-off-by: Kent Overstreet <[email protected]>
---
include/linux/shrinker.h | 1 +
mm/shrinker_debug.c | 18 ++++++++++++++++++
2 files changed, 19 insertions(+)
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 497a7e8e348c..b8e57afabbcc 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -143,6 +143,7 @@ __printf(2, 3)
struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...);
void shrinker_register(struct shrinker *shrinker);
void shrinker_free(struct shrinker *shrinker);
+void shrinker_to_text(struct seq_buf *, struct shrinker *);
void shrinkers_to_text(struct seq_buf *);
static inline bool shrinker_try_get(struct shrinker *shrinker)
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
index 12ea5486a3e9..39342aa9f4ca 100644
--- a/mm/shrinker_debug.c
+++ b/mm/shrinker_debug.c
@@ -2,6 +2,7 @@
#include <linux/idr.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
+#include <linux/seq_buf.h>
#include <linux/seq_file.h>
#include <linux/shrinker.h>
#include <linux/memcontrol.h>
@@ -159,6 +160,21 @@ static const struct file_operations shrinker_debugfs_scan_fops = {
.write = shrinker_debugfs_scan_write,
};
+static int shrinker_debugfs_report_show(struct seq_file *m, void *v)
+{
+ struct shrinker *shrinker = m->private;
+ char *bufp;
+ size_t buflen = seq_get_buf(m, &bufp);
+ struct seq_buf out;
+
+ seq_buf_init(&out, bufp, buflen);
+ shrinker_to_text(&out, shrinker);
+ seq_commit(m, seq_buf_used(&out));
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_report);
+
int shrinker_debugfs_add(struct shrinker *shrinker)
{
struct dentry *entry;
@@ -190,6 +206,8 @@ int shrinker_debugfs_add(struct shrinker *shrinker)
&shrinker_debugfs_count_fops);
debugfs_create_file("scan", 0220, entry, shrinker,
&shrinker_debugfs_scan_fops);
+ debugfs_create_file("report", 0440, entry, shrinker,
+ &shrinker_debugfs_report_fops);
return 0;
}
--
2.42.0
This patch:
- Changes show_mem() to always report on slab usage
- Instead of reporting on all slabs, we only report on the top 10
  slabs, in sorted order
- Also reports on shrinkers, with the new shrinkers_to_text().
Shrinkers need to be included in OOM/allocation failure reporting
because they're responsible for memory reclaim - if a shrinker isn't
giving up its memory, we need to know which one and why.
More OOM reporting can be moved to show_mem.c and improved; this patch
is only a start.
New example output on OOM/memory allocation failure:
00177 Mem-Info:
00177 active_anon:13706 inactive_anon:32266 isolated_anon:16
00177 active_file:1653 inactive_file:1822 isolated_file:0
00177 unevictable:0 dirty:0 writeback:0
00177 slab_reclaimable:6242 slab_unreclaimable:11168
00177 mapped:3824 shmem:3 pagetables:1266 bounce:0
00177 kernel_misc_reclaimable:0
00177 free:4362 free_pcp:35 free_cma:0
00177 Node 0 active_anon:54824kB inactive_anon:129064kB active_file:6612kB inactive_file:7288kB unevictable:0kB isolated(anon):64kB isolated(file):0kB mapped:15296kB dirty:0kB writeback:0kB shmem:12kB writeback_tmp:0kB kernel_stack:3392kB pagetables:5064kB all_unreclaimable? no
00177 DMA free:2232kB boost:0kB min:88kB low:108kB high:128kB reserved_highatomic:0KB active_anon:2924kB inactive_anon:6596kB active_file:428kB inactive_file:384kB unevictable:0kB writepending:0kB present:15992kB managed:15360kB mlocked:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB
00177 lowmem_reserve[]: 0 426 426 426
00177 DMA32 free:15092kB boost:5836kB min:8432kB low:9080kB high:9728kB reserved_highatomic:0KB active_anon:52196kB inactive_anon:122392kB active_file:6176kB inactive_file:7068kB unevictable:0kB writepending:0kB present:507760kB managed:441816kB mlocked:0kB bounce:0kB free_pcp:72kB local_pcp:0kB free_cma:0kB
00177 lowmem_reserve[]: 0 0 0 0
00177 DMA: 284*4kB (UM) 53*8kB (UM) 21*16kB (U) 11*32kB (U) 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 2248kB
00177 DMA32: 2765*4kB (UME) 375*8kB (UME) 57*16kB (UM) 5*32kB (U) 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 15132kB
00177 4656 total pagecache pages
00177 1031 pages in swap cache
00177 Swap cache stats: add 6572399, delete 6572173, find 488603/3286476
00177 Free swap = 509112kB
00177 Total swap = 2097148kB
00177 130938 pages RAM
00177 0 pages HighMem/MovableOnly
00177 16644 pages reserved
00177 Unreclaimable slab info:
00177 9p-fcall-cache total: 8.25 MiB active: 8.25 MiB
00177 kernfs_node_cache total: 2.15 MiB active: 2.15 MiB
00177 kmalloc-64 total: 2.08 MiB active: 2.07 MiB
00177 task_struct total: 1.95 MiB active: 1.95 MiB
00177 kmalloc-4k total: 1.50 MiB active: 1.50 MiB
00177 signal_cache total: 1.34 MiB active: 1.34 MiB
00177 kmalloc-2k total: 1.16 MiB active: 1.16 MiB
00177 bch_inode_info total: 1.02 MiB active: 922 KiB
00177 perf_event total: 1.02 MiB active: 1.02 MiB
00177 biovec-max total: 992 KiB active: 960 KiB
00177 Shrinkers:
00177 super_cache_scan: objects: 127
00177 super_cache_scan: objects: 106
00177 jbd2_journal_shrink_scan: objects: 32
00177 ext4_es_scan: objects: 32
00177 bch2_btree_cache_scan: objects: 8
00177 nr nodes: 24
00177 nr dirty: 0
00177 cannibalize lock: 0000000000000000
00177
00177 super_cache_scan: objects: 8
00177 super_cache_scan: objects: 1
Cc: Andrew Morton <[email protected]>
Cc: Qi Zheng <[email protected]>
Cc: Roman Gushchin <[email protected]>
Cc: [email protected]
Signed-off-by: Kent Overstreet <[email protected]>
---
mm/oom_kill.c | 23 ---------------------
mm/show_mem.c | 20 +++++++++++++++++++
mm/slab.h | 6 ++++--
mm/slab_common.c | 52 +++++++++++++++++++++++++++++++++++++++---------
4 files changed, 67 insertions(+), 34 deletions(-)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 9e6071fde34a..4b825a2b353f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -168,27 +168,6 @@ static bool oom_unkillable_task(struct task_struct *p)
return false;
}
-/*
- * Check whether unreclaimable slab amount is greater than
- * all user memory(LRU pages).
- * dump_unreclaimable_slab() could help in the case that
- * oom due to too much unreclaimable slab used by kernel.
-*/
-static bool should_dump_unreclaim_slab(void)
-{
- unsigned long nr_lru;
-
- nr_lru = global_node_page_state(NR_ACTIVE_ANON) +
- global_node_page_state(NR_INACTIVE_ANON) +
- global_node_page_state(NR_ACTIVE_FILE) +
- global_node_page_state(NR_INACTIVE_FILE) +
- global_node_page_state(NR_ISOLATED_ANON) +
- global_node_page_state(NR_ISOLATED_FILE) +
- global_node_page_state(NR_UNEVICTABLE);
-
- return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
-}
-
/**
* oom_badness - heuristic function to determine which candidate task to kill
* @p: task struct of which task we should calculate
@@ -462,8 +441,6 @@ static void dump_header(struct oom_control *oc)
mem_cgroup_print_oom_meminfo(oc->memcg);
else {
__show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask, gfp_zone(oc->gfp_mask));
- if (should_dump_unreclaim_slab())
- dump_unreclaimable_slab();
}
if (sysctl_oom_dump_tasks)
dump_tasks(oc);
diff --git a/mm/show_mem.c b/mm/show_mem.c
index ba0808d6917f..ab258ab1161c 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -12,10 +12,12 @@
#include <linux/hugetlb.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
+#include <linux/seq_buf.h>
#include <linux/swap.h>
#include <linux/vmstat.h>
#include "internal.h"
+#include "slab.h"
#include "swap.h"
atomic_long_t _totalram_pages __read_mostly;
@@ -401,6 +403,7 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
{
unsigned long total = 0, reserved = 0, highmem = 0;
struct zone *zone;
+ char *buf;
printk("Mem-Info:\n");
show_free_areas(filter, nodemask, max_zone_idx);
@@ -423,4 +426,21 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
#ifdef CONFIG_MEMORY_FAILURE
printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
#endif
+
+ buf = kmalloc(4096, GFP_ATOMIC);
+ if (buf) {
+ struct seq_buf s;
+
+ printk("Unreclaimable slab info:\n");
+ seq_buf_init(&s, buf, 4096);
+ dump_unreclaimable_slab(&s);
+ printk("%s", seq_buf_str(&s));
+
+ printk("Shrinkers:\n");
+ seq_buf_init(&s, buf, 4096);
+ shrinkers_to_text(&s);
+ printk("%s", seq_buf_str(&s));
+
+ kfree(buf);
+ }
}
diff --git a/mm/slab.h b/mm/slab.h
index 3d07fb428393..c16358a8424c 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -818,10 +818,12 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
if ((__n = get_node(__s, __node)))
+struct seq_buf;
+
#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
-void dump_unreclaimable_slab(void);
+void dump_unreclaimable_slab(struct seq_buf *);
#else
-static inline void dump_unreclaimable_slab(void)
+static inline void dump_unreclaimable_slab(struct seq_buf *out)
{
}
#endif
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 8d431193c273..1eadff512422 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -26,6 +26,7 @@
#include <asm/tlbflush.h>
#include <asm/page.h>
#include <linux/memcontrol.h>
+#include <linux/seq_buf.h>
#include <linux/stackdepot.h>
#include "internal.h"
@@ -1295,10 +1296,15 @@ static int slab_show(struct seq_file *m, void *p)
return 0;
}
-void dump_unreclaimable_slab(void)
+void dump_unreclaimable_slab(struct seq_buf *out)
{
struct kmem_cache *s;
struct slabinfo sinfo;
+ struct slab_by_mem {
+ struct kmem_cache *s;
+ size_t total, active;
+ } slabs_by_mem[10], n;
+ int i, nr = 0;
/*
* Here acquiring slab_mutex is risky since we don't prefer to get
@@ -1308,24 +1314,52 @@ void dump_unreclaimable_slab(void)
* without acquiring the mutex.
*/
if (!mutex_trylock(&slab_mutex)) {
- pr_warn("excessive unreclaimable slab but cannot dump stats\n");
+ seq_buf_puts(out, "excessive unreclaimable slab but cannot dump stats\n");
return;
}
- pr_info("Unreclaimable slab info:\n");
- pr_info("Name Used Total\n");
-
list_for_each_entry(s, &slab_caches, list) {
if (s->flags & SLAB_RECLAIM_ACCOUNT)
continue;
get_slabinfo(s, &sinfo);
- if (sinfo.num_objs > 0)
- pr_info("%-17s %10luKB %10luKB\n", s->name,
- (sinfo.active_objs * s->size) / 1024,
- (sinfo.num_objs * s->size) / 1024);
+ if (!sinfo.num_objs)
+ continue;
+
+ n.s = s;
+ n.total = sinfo.num_objs * s->size;
+ n.active = sinfo.active_objs * s->size;
+
+ for (i = 0; i < nr; i++)
+ if (n.total < slabs_by_mem[i].total)
+ break;
+
+ if (nr < ARRAY_SIZE(slabs_by_mem)) {
+ memmove(&slabs_by_mem[i + 1],
+ &slabs_by_mem[i],
+ sizeof(slabs_by_mem[0]) * (nr - i));
+ nr++;
+ } else if (i) {
+ i--;
+ memmove(&slabs_by_mem[0],
+ &slabs_by_mem[1],
+ sizeof(slabs_by_mem[0]) * i);
+ } else {
+ continue;
+ }
+
+ slabs_by_mem[i] = n;
+ }
+
+ for (i = nr - 1; i >= 0; --i) {
+ seq_buf_printf(out, "%-17s total: ", slabs_by_mem[i].s->name);
+ seq_buf_human_readable_u64(out, slabs_by_mem[i].total, STRING_UNITS_2);
+ seq_buf_printf(out, " active: ");
+ seq_buf_human_readable_u64(out, slabs_by_mem[i].active, STRING_UNITS_2);
+ seq_buf_putc(out, '\n');
}
+
mutex_unlock(&slab_mutex);
}
--
2.42.0
From: Daniel Hill <[email protected]>
This adds distinct counters for every reason the btree node shrinker can
fail to free an object - if our shrinker isn't making progress, this
will tell us why.
Signed-off-by: Daniel Hill <[email protected]>
Signed-off-by: Kent Overstreet <[email protected]>
---
fs/bcachefs/btree_cache.c | 91 +++++++++++++++++++++++++++++----------
fs/bcachefs/btree_cache.h | 2 +-
fs/bcachefs/btree_types.h | 10 +++++
fs/bcachefs/sysfs.c | 2 +-
4 files changed, 81 insertions(+), 24 deletions(-)
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index aab279dff944..72dea90e12fa 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -15,6 +15,12 @@
#include <linux/sched/mm.h>
#include <linux/seq_buf.h>
+#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
+do { \
+ if (shrinker_counter) \
+ bc->not_freed_##counter++; \
+} while (0)
+
const char * const bch2_btree_node_flags[] = {
#define x(f) #f,
BTREE_FLAGS()
@@ -202,7 +208,7 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc,
* this version is for btree nodes that have already been freed (we're not
* reaping a real btree node)
*/
-static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
{
struct btree_cache *bc = &c->btree_cache;
int ret = 0;
@@ -212,38 +218,64 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
if (b->flags & ((1U << BTREE_NODE_dirty)|
(1U << BTREE_NODE_read_in_flight)|
(1U << BTREE_NODE_write_in_flight))) {
- if (!flush)
+ if (!flush) {
+ if (btree_node_dirty(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
+ else if (btree_node_read_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
+ else if (btree_node_write_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
return -BCH_ERR_ENOMEM_btree_node_reclaim;
+ }
/* XXX: waiting on IO with btree cache lock held */
bch2_btree_node_wait_on_read(b);
bch2_btree_node_wait_on_write(b);
}
- if (!six_trylock_intent(&b->c.lock))
+ if (!six_trylock_intent(&b->c.lock)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
return -BCH_ERR_ENOMEM_btree_node_reclaim;
+ }
- if (!six_trylock_write(&b->c.lock))
+ if (!six_trylock_write(&b->c.lock)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
goto out_unlock_intent;
+ }
/* recheck under lock */
if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
(1U << BTREE_NODE_write_in_flight))) {
- if (!flush)
+ if (!flush) {
+ if (btree_node_read_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
+ else if (btree_node_write_in_flight(b))
+ BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
goto out_unlock;
+ }
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
goto wait_on_io;
}
- if (btree_node_noevict(b) ||
- btree_node_write_blocked(b) ||
- btree_node_will_make_reachable(b))
+ if (btree_node_noevict(b)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
+ goto out_unlock;
+ }
+ if (btree_node_write_blocked(b)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
+ goto out_unlock;
+ }
+ if (btree_node_will_make_reachable(b)) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
goto out_unlock;
+ }
if (btree_node_dirty(b)) {
- if (!flush)
+ if (!flush) {
+ BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
goto out_unlock;
+ }
/*
* Using the underscore version because we don't want to compact
* bsets after the write, since this node is about to be evicted
@@ -273,14 +305,14 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
goto out;
}
-static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
+static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
{
- return __btree_node_reclaim(c, b, false);
+ return __btree_node_reclaim(c, b, false, shrinker_counter);
}
static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
- return __btree_node_reclaim(c, b, true);
+ return __btree_node_reclaim(c, b, true, false);
}
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
@@ -328,11 +360,12 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
if (touched >= nr)
goto out;
- if (!btree_node_reclaim(c, b)) {
+ if (!btree_node_reclaim(c, b, true)) {
btree_node_data_free(c, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
freed++;
+ bc->freed++;
}
}
restart:
@@ -341,9 +374,11 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
if (btree_node_accessed(b)) {
clear_btree_node_accessed(b);
- } else if (!btree_node_reclaim(c, b)) {
+ bc->not_freed_access_bit++;
+ } else if (!btree_node_reclaim(c, b, true)) {
freed++;
btree_node_data_free(c, b);
+ bc->freed++;
bch2_btree_node_hash_remove(bc, b);
six_unlock_write(&b->c.lock);
@@ -400,7 +435,7 @@ static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker
size_t buflen = seq_buf_get_buf(s, &cbuf);
struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
- bch2_btree_cache_to_text(&out, c);
+ bch2_btree_cache_to_text(&out, &c->btree_cache);
seq_buf_commit(s, out.pos);
}
@@ -564,7 +599,7 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
struct btree *b;
list_for_each_entry_reverse(b, &bc->live, list)
- if (!btree_node_reclaim(c, b))
+ if (!btree_node_reclaim(c, b, false))
return b;
while (1) {
@@ -600,7 +635,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
* disk node. Check the freed list before allocating a new one:
*/
list_for_each_entry(b, freed, list)
- if (!btree_node_reclaim(c, b)) {
+ if (!btree_node_reclaim(c, b, false)) {
list_del_init(&b->list);
goto got_node;
}
@@ -626,7 +661,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
* the list. Check if there's any freed nodes there:
*/
list_for_each_entry(b2, &bc->freeable, list)
- if (!btree_node_reclaim(c, b2)) {
+ if (!btree_node_reclaim(c, b2, false)) {
swap(b->data, b2->data);
swap(b->aux_data, b2->aux_data);
btree_node_to_freedlist(bc, b2);
@@ -1222,9 +1257,21 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc
stats.failed);
}
-void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c)
+void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
{
- prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
- prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
- prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
+ prt_printf(out, "nr nodes:\t\t%u\n", bc->used);
+ prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty));
+ prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
+
+ prt_printf(out, "freed:\t\t\t\t%u\n", bc->freed);
+ prt_printf(out, "not freed, dirty:\t\t%u\n", bc->not_freed_dirty);
+ prt_printf(out, "not freed, write in flight:\t%u\n", bc->not_freed_write_in_flight);
+ prt_printf(out, "not freed, read in flight:\t%u\n", bc->not_freed_read_in_flight);
+ prt_printf(out, "not freed, lock intent failed:\t%u\n", bc->not_freed_lock_intent);
+ prt_printf(out, "not freed, lock write failed:\t%u\n", bc->not_freed_lock_write);
+ prt_printf(out, "not freed, access bit:\t\t%u\n", bc->not_freed_access_bit);
+ prt_printf(out, "not freed, no evict failed:\t%u\n", bc->not_freed_noevict);
+ prt_printf(out, "not freed, write blocked:\t%u\n", bc->not_freed_write_blocked);
+ prt_printf(out, "not freed, will make reachable:\t%u\n", bc->not_freed_will_make_reachable);
+
}
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index cfb80b201d61..bfe1d7482cbc 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -126,6 +126,6 @@ static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b)
const char *bch2_btree_id_str(enum btree_id);
void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
-void bch2_btree_cache_to_text(struct printbuf *, const struct bch_fs *);
+void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *);
#endif /* _BCACHEFS_BTREE_CACHE_H */
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 2326bceb34f8..14983e778756 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -162,6 +162,16 @@ struct btree_cache {
/* Number of elements in live + freeable lists */
unsigned used;
unsigned reserve;
+ unsigned freed;
+ unsigned not_freed_lock_intent;
+ unsigned not_freed_lock_write;
+ unsigned not_freed_dirty;
+ unsigned not_freed_read_in_flight;
+ unsigned not_freed_write_in_flight;
+ unsigned not_freed_noevict;
+ unsigned not_freed_write_blocked;
+ unsigned not_freed_will_make_reachable;
+ unsigned not_freed_access_bit;
atomic_t dirty;
struct shrinker *shrink;
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index ab743115f169..264c46b456c2 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -402,7 +402,7 @@ SHOW(bch2_fs)
bch2_btree_updates_to_text(out, c);
if (attr == &sysfs_btree_cache)
- bch2_btree_cache_to_text(out, c);
+ bch2_btree_cache_to_text(out, &c->btree_cache);
if (attr == &sysfs_btree_key_cache)
bch2_btree_key_cache_to_text(out, &c->btree_key_cache);
--
2.42.0
Hi Kent,
On 2023/11/23 07:25, Kent Overstreet wrote:
> This adds a new callback method to shrinkers which they can use to
> describe anything relevant to memory reclaim about their internal state,
> for example object dirtyness.
I have no objection to this proposal, but maybe it would be better to
place this feature under CONFIG_SHRINKER_DEBUG?
Hi Roman, what do you think?
Also +CC Dave.
More comments below.
>
> This patch also adds shrinkers_to_text(), which reports on the top 10
> shrinkers - by object count - in sorted order, to be used in OOM
> reporting.
>
> Cc: Andrew Morton <[email protected]>
> Cc: Qi Zheng <[email protected]>
> Cc: Roman Gushchin <[email protected]>
> Cc: [email protected]
> Signed-off-by: Kent Overstreet <[email protected]>
> ---
> include/linux/shrinker.h | 6 +++-
> mm/shrinker.c | 73 +++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 77 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
> index 1a00be90d93a..968c55474e78 100644
> --- a/include/linux/shrinker.h
> +++ b/include/linux/shrinker.h
> @@ -24,6 +24,8 @@ struct shrinker_info {
> struct shrinker_info_unit *unit[];
> };
>
> +struct seq_buf;
> +
> /*
> * This struct is used to pass information from page reclaim to the shrinkers.
> * We consolidate the values for easier extension later.
> @@ -80,10 +82,12 @@ struct shrink_control {
> * @flags determine the shrinker abilities, like numa awareness
> */
> struct shrinker {
> + const char *name;
> unsigned long (*count_objects)(struct shrinker *,
> struct shrink_control *sc);
> unsigned long (*scan_objects)(struct shrinker *,
> struct shrink_control *sc);
> + void (*to_text)(struct seq_buf *, struct shrinker *);
The "to_text" looks a little strange, how about naming it
"stat_objects"?
>
> long batch; /* reclaim batch size, 0 = default */
> int seeks; /* seeks to recreate an obj */
> @@ -110,7 +114,6 @@ struct shrinker {
> #endif
> #ifdef CONFIG_SHRINKER_DEBUG
> int debugfs_id;
> - const char *name;
The name is only allocated when CONFIG_SHRINKER_DEBUG is enabled;
otherwise its value is NULL. So you can't move it out of the #ifdef and
use it while CONFIG_SHRINKER_DEBUG is disabled.
> struct dentry *debugfs_entry;
> #endif
> /* objs pending delete, per node */
> @@ -135,6 +138,7 @@ __printf(2, 3)
> struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...);
> void shrinker_register(struct shrinker *shrinker);
> void shrinker_free(struct shrinker *shrinker);
> +void shrinkers_to_text(struct seq_buf *);
>
> static inline bool shrinker_try_get(struct shrinker *shrinker)
> {
> diff --git a/mm/shrinker.c b/mm/shrinker.c
> index dd91eab43ed3..4976dbac4c83 100644
> --- a/mm/shrinker.c
> +++ b/mm/shrinker.c
> @@ -1,8 +1,9 @@
> // SPDX-License-Identifier: GPL-2.0
> #include <linux/memcontrol.h>
> +#include <linux/rculist.h>
> #include <linux/rwsem.h>
> +#include <linux/seq_buf.h>
> #include <linux/shrinker.h>
> -#include <linux/rculist.h>
> #include <trace/events/vmscan.h>
>
> #include "internal.h"
> @@ -807,3 +808,73 @@ void shrinker_free(struct shrinker *shrinker)
> call_rcu(&shrinker->rcu, shrinker_free_rcu_cb);
> }
> EXPORT_SYMBOL_GPL(shrinker_free);
> +
> +void shrinker_to_text(struct seq_buf *out, struct shrinker *shrinker)
> +{
> + struct shrink_control sc = { .gfp_mask = GFP_KERNEL, };
> +
> + seq_buf_puts(out, shrinker->name);
> + seq_buf_printf(out, " objects: %lu\n", shrinker->count_objects(shrinker, &sc));
> +
> + if (shrinker->to_text) {
> + shrinker->to_text(out, shrinker);
> + seq_buf_puts(out, "\n");
> + }
> +}
> +
> +/**
> + * shrinkers_to_text - Report on shrinkers with highest usage
> + *
> + * This reports on the top 10 shrinkers, by object counts, in sorted order:
> + * intended to be used for OOM reporting.
> + */
> +void shrinkers_to_text(struct seq_buf *out)
> +{
> + struct shrinker *shrinker;
> + struct shrinker_by_mem {
> + struct shrinker *shrinker;
> + unsigned long mem;
The "mem" also looks a little strange, how about naming it
"freeable"?
> + } shrinkers_by_mem[10];
> + int i, nr = 0;
> +
> + if (!mutex_trylock(&shrinker_mutex)) {
> + seq_buf_puts(out, "(couldn't take shrinker lock)");
> + return;
> + }
We now have a lockless method (RCU + refcount) to iterate shrinker_list;
please refer to shrink_slab().
Thanks,
Qi
> +
> + list_for_each_entry(shrinker, &shrinker_list, list) {
> + struct shrink_control sc = { .gfp_mask = GFP_KERNEL, };
> + unsigned long mem = shrinker->count_objects(shrinker, &sc);
> +
> + if (!mem || mem == SHRINK_STOP || mem == SHRINK_EMPTY)
> + continue;
> +
> + for (i = 0; i < nr; i++)
> + if (mem < shrinkers_by_mem[i].mem)
> + break;
> +
> + if (nr < ARRAY_SIZE(shrinkers_by_mem)) {
> + memmove(&shrinkers_by_mem[i + 1],
> + &shrinkers_by_mem[i],
> + sizeof(shrinkers_by_mem[0]) * (nr - i));
> + nr++;
> + } else if (i) {
> + i--;
> + memmove(&shrinkers_by_mem[0],
> + &shrinkers_by_mem[1],
> + sizeof(shrinkers_by_mem[0]) * i);
> + } else {
> + continue;
> + }
> +
> + shrinkers_by_mem[i] = (struct shrinker_by_mem) {
> + .shrinker = shrinker,
> + .mem = mem,
> + };
> + }
> +
> + for (i = nr - 1; i >= 0; --i)
> + shrinker_to_text(out, shrinkers_by_mem[i].shrinker);
> +
> + mutex_unlock(&shrinker_mutex);
> +}
On Thu, Nov 23, 2023 at 11:32:59AM +0800, Qi Zheng wrote:
> > + void (*to_text)(struct seq_buf *, struct shrinker *);
>
> The "to_text" looks a little strange, how about naming it
> "stat_objects"?
The convention I've been using heavily in bcachefs is
typename_to_text(), or type.to_text(), for debug reports. The
consistency is nice.
>
> > long batch; /* reclaim batch size, 0 = default */
> > int seeks; /* seeks to recreate an obj */
> > @@ -110,7 +114,6 @@ struct shrinker {
> > #endif
> > #ifdef CONFIG_SHRINKER_DEBUG
> > int debugfs_id;
> > - const char *name;
>
> The name will only be allocated memory when the CONFIG_SHRINKER_DEBUG is
> enabled, otherwise its value is NULL. So you can't move it out and use
> it while CONFIG_SHRINKER_DEBUG is disabled.
Good catch
> > +void shrinkers_to_text(struct seq_buf *out)
> > +{
> > + struct shrinker *shrinker;
> > + struct shrinker_by_mem {
> > + struct shrinker *shrinker;
> > + unsigned long mem;
>
> The "mem" also looks a little strange, how about naming it
> "freeable"?
The type is just used in this one function, but sure.
>
> > + } shrinkers_by_mem[10];
> > + int i, nr = 0;
> > +
> > + if (!mutex_trylock(&shrinker_mutex)) {
> > + seq_buf_puts(out, "(couldn't take shrinker lock)");
> > + return;
> > + }
>
> We now have lockless method (RCU + refcount) to iterate shrinker_list,
> please refer to shrink_slab().
Will do!
Hi Kent,
On 2023/11/24 05:24, Kent Overstreet wrote:
> On Thu, Nov 23, 2023 at 11:32:59AM +0800, Qi Zheng wrote:
>>> + void (*to_text)(struct seq_buf *, struct shrinker *);
>>
>> The "to_text" looks a little strange, how about naming it
>> "stat_objects"?
>
> The convention I've been using heavily in bcachefs is
> typename_to_text(), or type.to_text(), for debug reports. The
OK.
> consistency is nice.
However, this is inconsistent with the name style of other
shrinker callbacks. Please use the "objects" suffix. As for
bcachefs's own callback function, you can use typename_to_text()
to ensure consistency.
Thanks,
Qi
>
>>
>>> long batch; /* reclaim batch size, 0 = default */
>>> int seeks; /* seeks to recreate an obj */
>>> @@ -110,7 +114,6 @@ struct shrinker {
>>> #endif
>>> #ifdef CONFIG_SHRINKER_DEBUG
>>> int debugfs_id;
>>> - const char *name;
>>
>> The name will only be allocated memory when the CONFIG_SHRINKER_DEBUG is
>> enabled, otherwise its value is NULL. So you can't move it out and use
>> it while CONFIG_SHRINKER_DEBUG is disabled.
>
> Good catch
>
>>> +void shrinkers_to_text(struct seq_buf *out)
>>> +{
>>> + struct shrinker *shrinker;
>>> + struct shrinker_by_mem {
>>> + struct shrinker *shrinker;
>>> + unsigned long mem;
>>
>> The "mem" also looks a little strange, how about naming it
>> "freeable"?
>
> The type is just used in this one function, but sure.
>
>>
>>> + } shrinkers_by_mem[10];
>>> + int i, nr = 0;
>>> +
>>> + if (!mutex_trylock(&shrinker_mutex)) {
>>> + seq_buf_puts(out, "(couldn't take shrinker lock)");
>>> + return;
>>> + }
>>
>> We now have lockless method (RCU + refcount) to iterate shrinker_list,
>> please refer to shrink_slab().
>
> Will do!
Hi Kent,
kernel test robot noticed the following build warnings:
[auto build test WARNING on akpm-mm/mm-everything]
[also build test WARNING on linus/master v6.7-rc2 next-20231124]
[cannot apply to vbabka-slab/for-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Kent-Overstreet/seq_buf-seq_buf_human_readable_u64/20231123-111937
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20231122232515.177833-3-kent.overstreet%40linux.dev
patch subject: [PATCH 2/7] mm: shrinker: Add a .to_text() method for shrinkers
config: x86_64-allnoconfig (https://download.01.org/0day-ci/archive/20231124/[email protected]/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231124/[email protected]/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/
All warnings (new ones prefixed by >>):
>> mm/shrinker.c:832: warning: Function parameter or member 'out' not described in 'shrinkers_to_text'
vim +832 mm/shrinker.c
824
825 /**
826 * shrinkers_to_text - Report on shrinkers with highest usage
827 *
828 * This reports on the top 10 shrinkers, by object counts, in sorted order:
829 * intended to be used for OOM reporting.
830 */
831 void shrinkers_to_text(struct seq_buf *out)
> 832 {
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
On Fri, Nov 24, 2023 at 11:08:11AM +0800, Qi Zheng wrote:
> Hi Kent,
>
> On 2023/11/24 05:24, Kent Overstreet wrote:
> > On Thu, Nov 23, 2023 at 11:32:59AM +0800, Qi Zheng wrote:
> > > > + void (*to_text)(struct seq_buf *, struct shrinker *);
> > >
> > > The "to_text" looks a little strange, how about naming it
> > > "stat_objects"?
> >
> > The convention I've been using heavily in bcachefs is
> > typename_to_text(), or type.to_text(), for debug reports. The
>
> OK.
>
> > consistency is nice.
>
> However, this is inconsistent with the name style of other
> shrinker callbacks. Please use the "objects" suffix. As for
> bcachefs's own callback function, you can use typename_to_text()
> to ensure consistency.
That would be inconsistent with introducing a convention to the wider
kernel.
> On Nov 25, 2023, at 08:30, Kent Overstreet <[email protected]> wrote:
>
> On Fri, Nov 24, 2023 at 11:08:11AM +0800, Qi Zheng wrote:
>> Hi Kent,
>>
>> On 2023/11/24 05:24, Kent Overstreet wrote:
>>> On Thu, Nov 23, 2023 at 11:32:59AM +0800, Qi Zheng wrote:
>>>>> + void (*to_text)(struct seq_buf *, struct shrinker *);
>>>>
>>>> The "to_text" looks a little strange, how about naming it
>>>> "stat_objects"?
>>>
>>> The convention I've been using heavily in bcachefs is
>>> typename_to_text(), or type.to_text(), for debug reports. The
>>
>> OK.
>>
>>> consistency is nice.
>>
>> However, this is inconsistent with the name style of other
>> shrinker callbacks. Please use the "objects" suffix. As for
>> bcachefs's own callback function, you can use typename_to_text()
>> to ensure consistency.
>
> That would be inconsistent with introducing a convention to the wider
> kernel.
>
I do not think .to_text is a good name. I really did not know what it
meant when I first looked at this name. I know you want to report on the
shrinkers' objects, so why not use .report_objects or .stat_objects as
proposed by Qi. Although .to_text is only used by bcachefs now, the
shrinker is a general module which does not only serve bcachefs itself.
I think it would be better to use a more straightforward name.
Muchun,
Thanks.
On Tue, Nov 28, 2023 at 11:27:11AM +0800, Muchun Song wrote:
>
>
> > On Nov 25, 2023, at 08:30, Kent Overstreet <[email protected]> wrote:
> >
> > On Fri, Nov 24, 2023 at 11:08:11AM +0800, Qi Zheng wrote:
> >> Hi Kent,
> >>
> >> On 2023/11/24 05:24, Kent Overstreet wrote:
> >>> On Thu, Nov 23, 2023 at 11:32:59AM +0800, Qi Zheng wrote:
> >>>>> + void (*to_text)(struct seq_buf *, struct shrinker *);
> >>>>
> >>>> The "to_text" looks a little strange, how about naming it
> >>>> "stat_objects"?
> >>>
> >>> The convention I've been using heavily in bcachefs is
> >>> typename_to_text(), or type.to_text(), for debug reports. The
> >>
> >> OK.
> >>
> >>> consistency is nice.
> >>
> >> However, this is inconsistent with the name style of other
> >> shrinker callbacks. Please use the "objects" suffix. As for
> >> bcachefs's own callback function, you can use typename_to_text()
> >> to ensure consistency.
> >
> > That would be inconsistent with introducing a convention to the wider
> > kernel.
> >
>
> I don not think .to_text is a good name. I really do not know what it means
> when I first look at this name. I knew you want to report the objects of
> shrinks, so why not use .report_objects or stat_objects proposed by Qi.
> Although .to_text is only used by bcachefs now, shrinker is a general module
> which is not only serving the bcachefs itself. I think it should be better
> to use a more straightforward name.
No, .report_objects or .stat_objects would be wrong; this isn't
generating a report on the objects owned by the shrinker, it's just a
report on the statistics of the shrinker itself.
That's why the convention is typename_to_text() - generate a text
representation of an object of that type.
On 2023/11/28 11:53, Kent Overstreet wrote:
> On Tue, Nov 28, 2023 at 11:27:11AM +0800, Muchun Song wrote:
>>
>>
>>> On Nov 25, 2023, at 08:30, Kent Overstreet <[email protected]> wrote:
>>>
>>> On Fri, Nov 24, 2023 at 11:08:11AM +0800, Qi Zheng wrote:
>>>> Hi Kent,
>>>>
>>>> On 2023/11/24 05:24, Kent Overstreet wrote:
>>>>> On Thu, Nov 23, 2023 at 11:32:59AM +0800, Qi Zheng wrote:
>>>>>>> + void (*to_text)(struct seq_buf *, struct shrinker *);
>>>>>>
>>>>>> The "to_text" looks a little strange, how about naming it
>>>>>> "stat_objects"?
>>>>>
>>>>> The convention I've been using heavily in bcachefs is
>>>>> typename_to_text(), or type.to_text(), for debug reports. The
>>>>
>>>> OK.
>>>>
>>>>> consistency is nice.
>>>>
>>>> However, this is inconsistent with the name style of other
>>>> shrinker callbacks. Please use the "objects" suffix. As for
>>>> bcachefs's own callback function, you can use typename_to_text()
>>>> to ensure consistency.
>>>
>>> That would be inconsistent with introducing a convention to the wider
>>> kernel.
>>>
>>
>> I don not think .to_text is a good name. I really do not know what it means
>> when I first look at this name. I knew you want to report the objects of
>> shrinks, so why not use .report_objects or stat_objects proposed by Qi.
>> Although .to_text is only used by bcachefs now, shrinker is a general module
>> which is not only serving the bcachefs itself. I think it should be better
>> to use a more straightforward name.
>
> No, .report_objects or .stat_objects would be wrong; this isn't
> generating a report on the objects owned by the shrinker, it's just a
> report on the statistics of the shrinker itself.
Now I think adding this method might not be a good idea. If we allow
shrinkers to report their own private information, OOM logs may become
cluttered. Most people only care about some general information when
troubleshooting an OOM problem, not the private information of a
shrinker.
So I thought maybe we could add some general statistics to the shrinker,
but adding private ".to_text" method is not necessary.
Also +CC Michal, who is reviwing OOM related patches recently.
Thanks,
Qi
>
> That's why the convention is typename_to_text() - generate a text
> representation of an object of that type.
On Wed 22-11-23 18:25:05, Kent Overstreet wrote:
> This patchset does a few things to aid in OOM debugging, in particular
> when shrinkers are involved:
>
> - improves the show_mem OOM report: it now reports on shrinkers, and
> for both shrinkers and slab we only report the top 10 entries,
> sorted, not the full list
>
> - add shrinker_to_text(), for the show_mem report and debugfs, and a
> an optional shrinker.to_text() callback to report extra
> driver-specific information
>
> - add extra counters for the shrinker_to_text() report
>
> - implement shrinker.to_text() for bcachefs, giving an example of how
> to use the callbacks
Could you expand some more about all these? What is the additional
information you can get and how usable that is? Some examples would be
really useful in the cover letter to establish grounds for the
discussion.
/me is looking at patches to find out more.
--
Michal Hocko
SUSE Labs
On Wed 22-11-23 18:25:07, Kent Overstreet wrote:
[...]
> +void shrinkers_to_text(struct seq_buf *out)
> +{
> + struct shrinker *shrinker;
> + struct shrinker_by_mem {
> + struct shrinker *shrinker;
> + unsigned long mem;
> + } shrinkers_by_mem[10];
> + int i, nr = 0;
> +
> + if (!mutex_trylock(&shrinker_mutex)) {
> + seq_buf_puts(out, "(couldn't take shrinker lock)");
> + return;
> + }
> +
> + list_for_each_entry(shrinker, &shrinker_list, list) {
> + struct shrink_control sc = { .gfp_mask = GFP_KERNEL, };
This seems to be global reclaim specific. What about memcg reclaim?
--
Michal Hocko
SUSE Labs
On Wed 22-11-23 18:25:09, Kent Overstreet wrote:
[...]
> 00177 Shrinkers:
> 00177 super_cache_scan: objects: 127
> 00177 super_cache_scan: objects: 106
> 00177 jbd2_journal_shrink_scan: objects: 32
> 00177 ext4_es_scan: objects: 32
> 00177 bch2_btree_cache_scan: objects: 8
> 00177 nr nodes: 24
> 00177 nr dirty: 0
> 00177 cannibalize lock: 0000000000000000
> 00177
> 00177 super_cache_scan: objects: 8
> 00177 super_cache_scan: objects: 1
It would be really great to provide an example on how these numbers are
useful for the oom evaluation.
[...]
> @@ -423,4 +426,21 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
> #ifdef CONFIG_MEMORY_FAILURE
> printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
> #endif
> +
> + buf = kmalloc(4096, GFP_ATOMIC);
I really do not think we want to allow allocations from the OOM context.
Is there any reason why this cannot be a statically allocated buffer?
> + if (buf) {
> + struct seq_buf s;
> +
> + printk("Unreclaimable slab info:\n");
> + seq_buf_init(&s, buf, 4096);
> + dump_unreclaimable_slab(&s);
> + printk("%s", seq_buf_str(&s));
> +
> + printk("Shrinkers:\n");
> + seq_buf_init(&s, buf, 4096);
> + shrinkers_to_text(&s);
> + printk("%s", seq_buf_str(&s));
> +
> + kfree(buf);
> + }
> }
--
Michal Hocko
SUSE Labs
On Tue, Nov 28, 2023 at 11:01:16AM +0100, Michal Hocko wrote:
> On Wed 22-11-23 18:25:07, Kent Overstreet wrote:
> [...]
> > +void shrinkers_to_text(struct seq_buf *out)
> > +{
> > + struct shrinker *shrinker;
> > + struct shrinker_by_mem {
> > + struct shrinker *shrinker;
> > + unsigned long mem;
> > + } shrinkers_by_mem[10];
> > + int i, nr = 0;
> > +
> > + if (!mutex_trylock(&shrinker_mutex)) {
> > + seq_buf_puts(out, "(couldn't take shrinker lock)");
> > + return;
> > + }
> > +
> > + list_for_each_entry(shrinker, &shrinker_list, list) {
> > + struct shrink_control sc = { .gfp_mask = GFP_KERNEL, };
>
> This seems to be global reclaim specific. What about memcg reclaim?
I have no fsckin idea how memcg reclaim works - and, for that matter,
the recent lockless shrinking work seems to have neglected to write even
an iterator macro, leaving _that_ a nasty mess so I'm not touching that
either.
On Tue, Nov 28, 2023 at 11:07:18AM +0100, Michal Hocko wrote:
> On Wed 22-11-23 18:25:09, Kent Overstreet wrote:
> [...]
> > 00177 Shrinkers:
> > 00177 super_cache_scan: objects: 127
> > 00177 super_cache_scan: objects: 106
> > 00177 jbd2_journal_shrink_scan: objects: 32
> > 00177 ext4_es_scan: objects: 32
> > 00177 bch2_btree_cache_scan: objects: 8
> > 00177 nr nodes: 24
> > 00177 nr dirty: 0
> > 00177 cannibalize lock: 0000000000000000
> > 00177
> > 00177 super_cache_scan: objects: 8
> > 00177 super_cache_scan: objects: 1
>
> It would be really great to provide an example on how these numbers are
> useful for the oom evaluation.
I should've posted an example from the end of the patch series; I'll do
that later today.
> [...]
> > @@ -423,4 +426,21 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
> > #ifdef CONFIG_MEMORY_FAILURE
> > printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
> > #endif
> > +
> > + buf = kmalloc(4096, GFP_ATOMIC);
>
> I really do not think we want to allow allocations from the OOM context.
> Is there any reason why this cannot be a statically allocated buffer?
You've made this claim before without ever giving any reasoning behind
it.
It's GFP_ATOMIC; it has to work from _interrupt_ context, so OOM context
is fine.
And no, we don't want to burn 4k on a static buffer that is almost never
used; people do care about making the kernel run on small systems.
On Tue, Nov 28, 2023 at 02:23:36PM +0800, Qi Zheng wrote:
>
>
> On 2023/11/28 11:53, Kent Overstreet wrote:
> > On Tue, Nov 28, 2023 at 11:27:11AM +0800, Muchun Song wrote:
> > >
> > >
> > > > On Nov 25, 2023, at 08:30, Kent Overstreet <[email protected]> wrote:
> > > >
> > > > On Fri, Nov 24, 2023 at 11:08:11AM +0800, Qi Zheng wrote:
> > > > > Hi Kent,
> > > > >
> > > > > On 2023/11/24 05:24, Kent Overstreet wrote:
> > > > > > On Thu, Nov 23, 2023 at 11:32:59AM +0800, Qi Zheng wrote:
> > > > > > > > + void (*to_text)(struct seq_buf *, struct shrinker *);
> > > > > > >
> > > > > > > The "to_text" looks a little strange, how about naming it
> > > > > > > "stat_objects"?
> > > > > >
> > > > > > The convention I've been using heavily in bcachefs is
> > > > > > typename_to_text(), or type.to_text(), for debug reports. The
> > > > >
> > > > > OK.
> > > > >
> > > > > > consistency is nice.
> > > > >
> > > > > However, this is inconsistent with the name style of other
> > > > > shrinker callbacks. Please use the "objects" suffix. As for
> > > > > bcachefs's own callback function, you can use typename_to_text()
> > > > > to ensure consistency.
> > > >
> > > > That would be inconsistent with introducing a convention to the wider
> > > > kernel.
> > > >
> > >
> > > I don not think .to_text is a good name. I really do not know what it means
> > > when I first look at this name. I knew you want to report the objects of
> > > shrinks, so why not use .report_objects or stat_objects proposed by Qi.
> > > Although .to_text is only used by bcachefs now, shrinker is a general module
> > > which is not only serving the bcachefs itself. I think it should be better
> > > to use a more straightforward name.
> >
> > No, .report_objects or .stat_objects would be wrong; this isn't
> > generating a report on the objects owned by the shrinker, it's just a
> > report on the statistics of the shrinker itself.
>
> Now I think adding this method might not be a good idea. If we allow
> shrinkers to report thier own private information, OOM logs may become
> cluttered. Most people only care about some general information when
> troubleshooting OOM problem, but not the private information of a
> shrinker.
I agree with that.
It seems that the feature is mostly useful for kernel developers and it's easily
achievable by attaching a bpf program to the oom handler. If it requires a bit
of work on the bpf side, we can do that instead, but probably not. And this
solution can potentially provide way more information in a more flexible way.
So I'm not convinced it's a good idea to make the generic oom handling code
more complicated and fragile for everybody, as well as making oom reports differ
more between kernel versions and configurations.
Thanks!
On Tue 28-11-23 12:54:39, Kent Overstreet wrote:
> On Tue, Nov 28, 2023 at 11:07:18AM +0100, Michal Hocko wrote:
[...]
> > > @@ -423,4 +426,21 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
> > > #ifdef CONFIG_MEMORY_FAILURE
> > > printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
> > > #endif
> > > +
> > > + buf = kmalloc(4096, GFP_ATOMIC);
> >
> > I really do not think we want to allow allocations from the OOM context.
> > Is there any reason why this cannot be a statically allocated buffer?
>
> You've made this claim before without ever giving any reasoning behind
> it.
We are actively out of memory at the stage you would like to allocate
memory. I am pretty sure I have mentioned this argument in the past.
> It's GFP_ATOMIC; it has to work from _interrupt_ context, OOM context is
> fine.
These are completely _different_ contexts. GFP_ATOMIC works around the
inability of IRQ or any other atomic context to wait for reclaim by
dipping into the memory reserves. In a _global_ oom situation those
reserves are an extremely scarce resource. Debugging data is certainly
not something we would want to compete with e.g. oom victims or their
dependencies that might need to access those reserves in order to make
forward progress. Especially when the said debugging data is not
essential for analysing the OOM situation. And to be completely honest,
I haven't really seen any strong arguments for shrinker internal state
dumping other than that _in some very limited_ cases it _might_ be useful.
> And no, we don't want to burn 4k on a static buffer that is almost never
> used; people do care about making the kernel run on small systems.
WE DO NOT ALLOCATE FROM OOM context, let alone for something as
disposable as debugging data whose usefulness is not really clear. If
there are systems which cannot afford a 4kB static buffer then just
disable this debugging data.
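To make that concrete, a minimal sketch of what a statically allocated,
config-gated variant could look like (the Kconfig symbol and the helper
name are hypothetical; shrinkers_to_text() is from this series and
seq_buf_do_printk() is existing seq_buf API):

#include <linux/seq_buf.h>
#include <linux/shrinker.h>

#ifdef CONFIG_SHRINKER_OOM_REPORT	/* hypothetical Kconfig symbol */
/*
 * A single static buffer; callers would need to be serialized (the OOM
 * path already is, via oom_lock).
 */
static char shrinker_oom_buf[4096];

static void show_shrinkers_oom(void)
{
	struct seq_buf s;

	seq_buf_init(&s, shrinker_oom_buf, sizeof(shrinker_oom_buf));
	shrinkers_to_text(&s);
	seq_buf_do_printk(&s, KERN_INFO);
}
#else
static inline void show_shrinkers_oom(void) {}
#endif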
--
Michal Hocko
SUSE Labs
On Tue 28-11-23 16:34:35, Roman Gushchin wrote:
> On Tue, Nov 28, 2023 at 02:23:36PM +0800, Qi Zheng wrote:
[...]
> > Now I think adding this method might not be a good idea. If we allow
> > shrinkers to report thier own private information, OOM logs may become
> > cluttered. Most people only care about some general information when
> > troubleshooting OOM problem, but not the private information of a
> > shrinker.
>
> I agree with that.
>
> It seems that the feature is mostly useful for kernel developers and it's easily
> achievable by attaching a bpf program to the oom handler. If it requires a bit
> of work on the bpf side, we can do that instead, but probably not. And this
> solution can potentially provide way more information in a more flexible way.
>
> So I'm not convinced it's a good idea to make the generic oom handling code
> more complicated and fragile for everybody, as well as making oom reports differ
> more between kernel versions and configurations.
Completely agreed! From my many years of experience analysing oom
reports from production systems I would conclude the following
categories:
- clear runaways (and/or memory leaks)
  - userspace consumers - either shmem or anonymous memory
    predominantly consumes the memory, swap is either depleted or not
    configured. The OOM report is usually useful to pinpoint those as
    we have the required counters available.
  - kernel memory consumers - if we are lucky they are using the slab
    allocator and unreclaimable slab is a huge part of the memory
    consumption. If this is a page allocator user the oom report only
    helps to deduce the fact by looking at how much memory user + slab
    + page tables etc. add up to. But identifying the root cause is
    close to impossible without something like page_owner or a crash
    dump.
- misbehaving memory reclaim
  - minority of issues, and the oom report is usually insufficient to
    drill down to the root cause. If the problem is reproducible then
    collecting vmstat data can give a much better clue.
  - a high number of slab reclaimable objects or free swap are good
    indicators. Shrinkers data could potentially be helpful in the
    slab case but I really have a hard time remembering any such
    situation.
On non-production systems the situation is quite different. I can see
how it could be very beneficial to add very specific debugging data for
a subsystem/shrinker which is being developed and could cause the OOM.
For that purpose the proposed scheme is rather inflexible AFAICS.
--
Michal Hocko
SUSE Labs
On Tue 28-11-23 12:48:53, Kent Overstreet wrote:
> On Tue, Nov 28, 2023 at 11:01:16AM +0100, Michal Hocko wrote:
> > On Wed 22-11-23 18:25:07, Kent Overstreet wrote:
> > [...]
> > > +void shrinkers_to_text(struct seq_buf *out)
> > > +{
> > > + struct shrinker *shrinker;
> > > + struct shrinker_by_mem {
> > > + struct shrinker *shrinker;
> > > + unsigned long mem;
> > > + } shrinkers_by_mem[10];
> > > + int i, nr = 0;
> > > +
> > > + if (!mutex_trylock(&shrinker_mutex)) {
> > > + seq_buf_puts(out, "(couldn't take shrinker lock)");
> > > + return;
> > > + }
> > > +
> > > + list_for_each_entry(shrinker, &shrinker_list, list) {
> > > + struct shrink_control sc = { .gfp_mask = GFP_KERNEL, };
> >
> > This seems to be global reclaim specific. What about memcg reclaim?
>
> I have no fsckin idea how memcg reclaim works - and, for that matter,
> the recent lockless shrinking work seems to have neglected to write even
> an iterator macro, leaving _that_ a nasty mess so I'm not touching that
> either.
OK, you could have made it clearer that all of this is aimed at the
global OOM handling, with an outlook on what should be done if this was
ever required.
Another thing you want to look into is a NUMA constrained OOM (mbind,
cpuset) where this output could be actively misleading.
--
Michal Hocko
SUSE Labs
On Wed, Nov 29, 2023 at 05:02:35PM +0100, Michal Hocko wrote:
> On Tue 28-11-23 12:48:53, Kent Overstreet wrote:
> > On Tue, Nov 28, 2023 at 11:01:16AM +0100, Michal Hocko wrote:
> > > On Wed 22-11-23 18:25:07, Kent Overstreet wrote:
> > > [...]
> > > > +void shrinkers_to_text(struct seq_buf *out)
> > > > +{
> > > > + struct shrinker *shrinker;
> > > > + struct shrinker_by_mem {
> > > > + struct shrinker *shrinker;
> > > > + unsigned long mem;
> > > > + } shrinkers_by_mem[10];
> > > > + int i, nr = 0;
> > > > +
> > > > + if (!mutex_trylock(&shrinker_mutex)) {
> > > > + seq_buf_puts(out, "(couldn't take shrinker lock)");
> > > > + return;
> > > > + }
> > > > +
> > > > + list_for_each_entry(shrinker, &shrinker_list, list) {
> > > > + struct shrink_control sc = { .gfp_mask = GFP_KERNEL, };
> > >
> > > This seems to be global reclaim specific. What about memcg reclaim?
> >
> > I have no fsckin idea how memcg reclaim works - and, for that matter,
> > the recent lockless shrinking work seems to have neglected to write even
> > an iterator macro, leaving _that_ a nasty mess so I'm not touching that
> > either.
>
> OK, you could have made it more clearly that all of this is aiming at
> the global OOM handling. With an outlook on what should be done if this
> was ever required.
>
> Another thing you want to look into is a NUMA constrained OOM (mbind,
> cpuset) where this output could be actively misleading.
Yeah.
It's not clear to me how (if at all) we want memcg to be represented in
this output; it's not something us filesystem developers think about a
lot. NUMA definitely should, though.
On Wed, Nov 29, 2023 at 10:14:54AM +0100, Michal Hocko wrote:
> On Tue 28-11-23 16:34:35, Roman Gushchin wrote:
> > On Tue, Nov 28, 2023 at 02:23:36PM +0800, Qi Zheng wrote:
> [...]
> > > Now I think adding this method might not be a good idea. If we allow
> > > shrinkers to report thier own private information, OOM logs may become
> > > cluttered. Most people only care about some general information when
> > > troubleshooting OOM problem, but not the private information of a
> > > shrinker.
> >
> > I agree with that.
> >
> > It seems that the feature is mostly useful for kernel developers and it's easily
> > achievable by attaching a bpf program to the oom handler. If it requires a bit
> > of work on the bpf side, we can do that instead, but probably not. And this
> > solution can potentially provide way more information in a more flexible way.
> >
> > So I'm not convinced it's a good idea to make the generic oom handling code
> > more complicated and fragile for everybody, as well as making oom reports differ
> > more between kernel versions and configurations.
>
> Completely agreed! From my many years of experience of oom reports
> analysing from production systems I would conclude the following categories
> - clear runaways (and/or memory leaks)
> - userspace consumers - either shmem or anonymous memory
> predominantly consumes the memory, swap is either depleted
> or not configured.
> OOM report is usually useful to pinpoint those as we
> have required counters available
> - kernel memory consumers - if we are lucky they are
> using slab allocator and unreclaimable slab is a huge
> part of the memory consumption. If this is a page
> allocator user the oom repport only helps to deduce
> the fact by looking at how much user + slab + page
> table etc. form. But identifying the root cause is
> close to impossible without something like page_owner
> or a crash dump.
> - misbehaving memory reclaim
> - minority of issues and the oom report is usually
> insufficient to drill down to the root cause. If the
> problem is reproducible then collecting vmstat data
> can give a much better clue.
> - high number of slab reclaimable objects or free swap
> are good indicators. Shrinkers data could be
> potentially helpful in the slab case but I really have
> hard time to remember any such situation.
> On non-production systems the situation is quite different. I can see
> how it could be very beneficial to add a very specific debugging data
> for subsystem/shrinker which is developed and could cause the OOM. For
> that purpose the proposed scheme is rather inflexible AFAICS.
Considering that you're an MM guy, and that shrinkers are pretty much
universally used by _filesystem_ people - I'm not sure your experience
is the most relevant here?
The general attitude I've been seeing in this thread has been one of
dismissiveness towards filesystem people. Roman too; back when he was
working on his shrinker debug feature I reached out to him, explained
that I was working on my own, and asked about collaborating - got
crickets in response...
Hmm..
Besides that, I haven't seen anything whatsoever out of you guys to
make our lives easier regarding OOM debugging, nor do you guys even
seem interested in the needs and perspectives of the filesystem people.
Roman, your feature didn't help one bit for OOM debugging - it didn't
even come with documentation or hints as to what it's for.
BPF? Please.
Anyways.
Regarding log spam: that's something this patchset already starts to
address. I don't think we needed to be dumping every single slab in the
system, for ~2 pages worth of logs; hence this patchset changes that to
just print the top 10.
The same approach is taken with shrinkers: more targeted, less spammy
output.
So now that that concern has been addressed, perhaps some actual meat:
For one, the patchset adds tracking for when a shrinker was last asked
to free something, vs. when it was actually freed. So right there, we
can finally see at a glance when a shrinker has gotten stuck and which
one.
Next up, why has a shrinker gotten stuck?
That's why the .to_text() callback is needed: _shrinkers have internal
state_, and the reasons objects may not be getting freed are specific to
a given shrinker implementation. In bcachefs we added counters for each
individual reason an object may be skipped by the shrinker (io in
flight? that debugged a runaway prefetch issue. too many dirty? that
points to journal reclaim).
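To make that concrete, here's a rough sketch of the kind of thing such a
.to_text() can emit - the structure and counter names below are invented
for illustration, not the actual bcachefs code:

#include <linux/atomic.h>
#include <linux/jiffies.h>
#include <linux/seq_buf.h>
#include <linux/shrinker.h>

/* Invented example state - not the bcachefs implementation. */
struct my_cache {
	atomic_long_t	nr_objects;
	atomic_long_t	nr_dirty;
	atomic_long_t	skipped_lock_contended;
	atomic_long_t	skipped_io_in_flight;
	atomic_long_t	skipped_dirty;
	unsigned long	last_scan_requested;	/* jiffies, set in .scan_objects */
	unsigned long	last_object_freed;	/* jiffies, set when an object is freed */
};

static void my_cache_shrinker_to_text(struct seq_buf *out, struct shrinker *shrink)
{
	struct my_cache *c = shrink->private_data;

	seq_buf_printf(out, "objects:         %ld\n", atomic_long_read(&c->nr_objects));
	seq_buf_printf(out, "dirty:           %ld\n", atomic_long_read(&c->nr_dirty));
	seq_buf_printf(out, "skipped, lock:   %ld\n", atomic_long_read(&c->skipped_lock_contended));
	seq_buf_printf(out, "skipped, io:     %ld\n", atomic_long_read(&c->skipped_io_in_flight));
	seq_buf_printf(out, "skipped, dirty:  %ld\n", atomic_long_read(&c->skipped_dirty));
	seq_buf_printf(out, "since last scan: %ums\n",
		       jiffies_to_msecs(jiffies - c->last_scan_requested));
	seq_buf_printf(out, "since last free: %ums\n",
		       jiffies_to_msecs(jiffies - c->last_object_freed));
}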
I'm working on a .to_text() function for the struct super_block
shrinker, will post that one soon...
On 2023/11/30 07:11, Kent Overstreet wrote:
> On Wed, Nov 29, 2023 at 10:14:54AM +0100, Michal Hocko wrote:
>> On Tue 28-11-23 16:34:35, Roman Gushchin wrote:
>>> On Tue, Nov 28, 2023 at 02:23:36PM +0800, Qi Zheng wrote:
>> [...]
>>>> Now I think adding this method might not be a good idea. If we allow
>>>> shrinkers to report thier own private information, OOM logs may become
>>>> cluttered. Most people only care about some general information when
>>>> troubleshooting OOM problem, but not the private information of a
>>>> shrinker.
>>>
>>> I agree with that.
>>>
>>> It seems that the feature is mostly useful for kernel developers and it's easily
>>> achievable by attaching a bpf program to the oom handler. If it requires a bit
>>> of work on the bpf side, we can do that instead, but probably not. And this
>>> solution can potentially provide way more information in a more flexible way.
>>>
>>> So I'm not convinced it's a good idea to make the generic oom handling code
>>> more complicated and fragile for everybody, as well as making oom reports differ
>>> more between kernel versions and configurations.
>>
>> Completely agreed! From my many years of experience of oom reports
>> analysing from production systems I would conclude the following categories
>> - clear runaways (and/or memory leaks)
>> - userspace consumers - either shmem or anonymous memory
>> predominantly consumes the memory, swap is either depleted
>> or not configured.
>> OOM report is usually useful to pinpoint those as we
>> have required counters available
>> - kernel memory consumers - if we are lucky they are
>> using slab allocator and unreclaimable slab is a huge
>> part of the memory consumption. If this is a page
>> allocator user the oom repport only helps to deduce
>> the fact by looking at how much user + slab + page
>> table etc. form. But identifying the root cause is
>> close to impossible without something like page_owner
>> or a crash dump.
>> - misbehaving memory reclaim
>> - minority of issues and the oom report is usually
>> insufficient to drill down to the root cause. If the
>> problem is reproducible then collecting vmstat data
>> can give a much better clue.
>> - high number of slab reclaimable objects or free swap
>> are good indicators. Shrinkers data could be
>> potentially helpful in the slab case but I really have
>> hard time to remember any such situation.
>> On non-production systems the situation is quite different. I can see
>> how it could be very beneficial to add a very specific debugging data
>> for subsystem/shrinker which is developed and could cause the OOM. For
>> that purpose the proposed scheme is rather inflexible AFAICS.
>
> Considering that you're an MM guy, and that shrinkers are pretty much
> universally used by _filesystem_ people - I'm not sure your experience
> is the most relevant here?
>
> The general attitude I've been seeing in this thread has been one of
> dismissiveness towards filesystem people. Roman too; back when he was
Oh, please don't say that, it seems like you are the only one causing
the fight. We deeply respect the opinions of file system developers, so
I invited Dave to this thread from the beginning. And you didn’t CC
[email protected] yourself.
> working on his shrinker debug feature I reached out to him, explained
> that I was working on my own, and asked about collaborating - got
> crickets in response...
>
> Hmm..
>
> Besides that, I haven't seen anything what-so-ever out of you guys to
> make our lives easier, regarding OOM debugging, nor do you guys even
> seem interested in the needs and perspectives of the filesytem people.
> Roman, your feature didn't help one bit for OOM debuging - didn't even
> come with documentation or hints as to what it's for.
>
> BPF? Please.
(Disclaimer, no intention to start a fight, here are some objective
views.)
Why not? In addition to printk, there are many good debugging tools
worth trying, such as BPF related tools, drgn, etc.
For non-bcachefs developers, who knows what those statistics mean?
You can use BPF or drgn to traverse in advance to get the address of the
bcachefs shrinker structure, and then during OOM, find the bcachefs
private structure through the shrinker->private_data member, and then
dump the bcachefs private data. Is there any problem with this?
Thanks,
Qi
>
> Anyways.
>
> Regarding log spam: that's something this patchset already starts to
> address. I don't think we needed to be dumping every single slab in the
> system, for ~2 pages worth of logs; hence this patchset changes that to
> just print the top 10.
>
> The same approach is taken with shrinkers: more targeted, less spammy
> output.
>
> So now that that concern has been addressed, perhaps some actual meat:
>
> For one, the patchset adds tracking for when a shrinker was last asked
> to free something, vs. when it was actually freed. So right there, we
> can finally see at a glance when a shrinker has gotten stuck and which
> one.
>
> Next up, why has a shrinker gotten stuck?
>
> That's why the .to_text() callback is needed: _shrinkers have internal
> state_, and the reasons objects may not be getting freed are specific to
> a given shrinker implementation. In bcachefs we added counters for each
> individual reason an object may be skipped by the shrinker (io in
> flight? that debugged a runaway prefetch issue. too many dirty? that
> points to journal reclaim).
>
> I'm working on a .to_text() function for the struct super_block
> shrinker, will post that one soon...
On Thu, Nov 30, 2023 at 11:09:42AM +0800, Qi Zheng wrote:
>
>
> On 2023/11/30 07:11, Kent Overstreet wrote:
> > On Wed, Nov 29, 2023 at 10:14:54AM +0100, Michal Hocko wrote:
> > > On Tue 28-11-23 16:34:35, Roman Gushchin wrote:
> > > > On Tue, Nov 28, 2023 at 02:23:36PM +0800, Qi Zheng wrote:
> > > [...]
> > > > > Now I think adding this method might not be a good idea. If we allow
> > > > > shrinkers to report thier own private information, OOM logs may become
> > > > > cluttered. Most people only care about some general information when
> > > > > troubleshooting OOM problem, but not the private information of a
> > > > > shrinker.
> > > >
> > > > I agree with that.
> > > >
> > > > It seems that the feature is mostly useful for kernel developers and it's easily
> > > > achievable by attaching a bpf program to the oom handler. If it requires a bit
> > > > of work on the bpf side, we can do that instead, but probably not. And this
> > > > solution can potentially provide way more information in a more flexible way.
> > > >
> > > > So I'm not convinced it's a good idea to make the generic oom handling code
> > > > more complicated and fragile for everybody, as well as making oom reports differ
> > > > more between kernel versions and configurations.
> > >
> > > Completely agreed! From my many years of experience of oom reports
> > > analysing from production systems I would conclude the following categories
> > > - clear runaways (and/or memory leaks)
> > > - userspace consumers - either shmem or anonymous memory
> > > predominantly consumes the memory, swap is either depleted
> > > or not configured.
> > > OOM report is usually useful to pinpoint those as we
> > > have required counters available
> > > - kernel memory consumers - if we are lucky they are
> > > using slab allocator and unreclaimable slab is a huge
> > > part of the memory consumption. If this is a page
> > > allocator user the oom repport only helps to deduce
> > > the fact by looking at how much user + slab + page
> > > table etc. form. But identifying the root cause is
> > > close to impossible without something like page_owner
> > > or a crash dump.
> > > - misbehaving memory reclaim
> > > - minority of issues and the oom report is usually
> > > insufficient to drill down to the root cause. If the
> > > problem is reproducible then collecting vmstat data
> > > can give a much better clue.
> > > - high number of slab reclaimable objects or free swap
> > > are good indicators. Shrinkers data could be
> > > potentially helpful in the slab case but I really have
> > > hard time to remember any such situation.
> > > On non-production systems the situation is quite different. I can see
> > > how it could be very beneficial to add a very specific debugging data
> > > for subsystem/shrinker which is developed and could cause the OOM. For
> > > that purpose the proposed scheme is rather inflexible AFAICS.
> >
> > Considering that you're an MM guy, and that shrinkers are pretty much
> > universally used by _filesystem_ people - I'm not sure your experience
> > is the most relevant here?
> >
> > The general attitude I've been seeing in this thread has been one of
> > dismissiveness towards filesystem people. Roman too; back when he was
>
> Oh, please don't say that, it seems like you are the only one causing
> the fight. We deeply respect the opinions of file system developers, so
> I invited Dave to this thread from the beginning. And you didn’t CC
> [email protected] yourself.
>
> > working on his shrinker debug feature I reached out to him, explained
> > that I was working on my own, and asked about collaborating - got
> > crickets in response...
> >
> > Hmm..
> >
> > Besides that, I haven't seen anything what-so-ever out of you guys to
> > make our lives easier, regarding OOM debugging, nor do you guys even
> > seem interested in the needs and perspectives of the filesytem people.
> > Roman, your feature didn't help one bit for OOM debuging - didn't even
> > come with documentation or hints as to what it's for.
> >
> > BPF? Please.
>
> (Disclaimer, no intention to start a fight, here are some objective
> views.)
>
> Why not? In addition to printk, there are many good debugging tools
> worth trying, such as BPF related tools, drgn, etc.
>
> For non-bcachefs developers, who knows what those statistics mean?
>
> You can use BPF or drgn to traverse in advance to get the address of the
> bcachefs shrinker structure, and then during OOM, find the bcachefs
> private structure through the shrinker->private_data member, and then
> dump the bcachefs private data. Is there any problem with this?
No, BPF is not an excuse for not improving our OOM/allocation failure
reports. BPF/tracing are secondary tools; whenever we're logging
information about a problem we should strive to log enough information
to debug the issue.
We've got junk in there we don't need: as mentioned before, there's no
need to be dumping information on _every_ slab, we can pick the ones
using the most memory and show those.
Similarly for shrinkers, we're not going to be printing all of them -
the patchset picks the top 10 by objects and prints those. Could
probably be ~4, there's fewer shrinkers than slabs; also if we can get
shrinkers to report on memory owned in bytes, that will help too with
deciding what information is pertinent.
That's not a huge amount of information to be dumping, and it makes it
easier to debug something that has historically been a major pain point.
There's a lot more that could be done to make our OOM reports more
readable and useful to non-mm developers. Unfortunately, any time the
show_mem report is changed the immediate reaction seems to be "but
that will break my log parsing/change what I'm used to!"...
On 2023/11/30 11:21, Kent Overstreet wrote:
> On Thu, Nov 30, 2023 at 11:09:42AM +0800, Qi Zheng wrote:
>>
>>
>> On 2023/11/30 07:11, Kent Overstreet wrote:
>>> On Wed, Nov 29, 2023 at 10:14:54AM +0100, Michal Hocko wrote:
>>>> On Tue 28-11-23 16:34:35, Roman Gushchin wrote:
>>>>> On Tue, Nov 28, 2023 at 02:23:36PM +0800, Qi Zheng wrote:
>>>> [...]
>>>>>> Now I think adding this method might not be a good idea. If we allow
>>>>>> shrinkers to report thier own private information, OOM logs may become
>>>>>> cluttered. Most people only care about some general information when
>>>>>> troubleshooting OOM problem, but not the private information of a
>>>>>> shrinker.
>>>>>
>>>>> I agree with that.
>>>>>
>>>>> It seems that the feature is mostly useful for kernel developers and it's easily
>>>>> achievable by attaching a bpf program to the oom handler. If it requires a bit
>>>>> of work on the bpf side, we can do that instead, but probably not. And this
>>>>> solution can potentially provide way more information in a more flexible way.
>>>>>
>>>>> So I'm not convinced it's a good idea to make the generic oom handling code
>>>>> more complicated and fragile for everybody, as well as making oom reports differ
>>>>> more between kernel versions and configurations.
>>>>
>>>> Completely agreed! From my many years of experience of oom reports
>>>> analysing from production systems I would conclude the following categories
>>>> - clear runaways (and/or memory leaks)
>>>> - userspace consumers - either shmem or anonymous memory
>>>> predominantly consumes the memory, swap is either depleted
>>>> or not configured.
>>>> OOM report is usually useful to pinpoint those as we
>>>> have required counters available
>>>> - kernel memory consumers - if we are lucky they are
>>>> using slab allocator and unreclaimable slab is a huge
>>>> part of the memory consumption. If this is a page
>>>> allocator user the oom repport only helps to deduce
>>>> the fact by looking at how much user + slab + page
>>>> table etc. form. But identifying the root cause is
>>>> close to impossible without something like page_owner
>>>> or a crash dump.
>>>> - misbehaving memory reclaim
>>>> - minority of issues and the oom report is usually
>>>> insufficient to drill down to the root cause. If the
>>>> problem is reproducible then collecting vmstat data
>>>> can give a much better clue.
>>>> - high number of slab reclaimable objects or free swap
>>>> are good indicators. Shrinkers data could be
>>>> potentially helpful in the slab case but I really have
>>>> hard time to remember any such situation.
>>>> On non-production systems the situation is quite different. I can see
>>>> how it could be very beneficial to add a very specific debugging data
>>>> for subsystem/shrinker which is developed and could cause the OOM. For
>>>> that purpose the proposed scheme is rather inflexible AFAICS.
>>>
>>> Considering that you're an MM guy, and that shrinkers are pretty much
>>> universally used by _filesystem_ people - I'm not sure your experience
>>> is the most relevant here?
>>>
>>> The general attitude I've been seeing in this thread has been one of
>>> dismissiveness towards filesystem people. Roman too; back when he was
>>
>> Oh, please don't say that, it seems like you are the only one causing
>> the fight. We deeply respect the opinions of file system developers, so
>> I invited Dave to this thread from the beginning. And you didn’t CC
>> [email protected] yourself.
>>
>>> working on his shrinker debug feature I reached out to him, explained
>>> that I was working on my own, and asked about collaborating - got
>>> crickets in response...
>>>
>>> Hmm..
>>>
>>> Besides that, I haven't seen anything what-so-ever out of you guys to
>>> make our lives easier, regarding OOM debugging, nor do you guys even
>>> seem interested in the needs and perspectives of the filesytem people.
>>> Roman, your feature didn't help one bit for OOM debuging - didn't even
>>> come with documentation or hints as to what it's for.
>>>
>>> BPF? Please.
>>
>> (Disclaimer, no intention to start a fight, here are some objective
>> views.)
>>
>> Why not? In addition to printk, there are many good debugging tools
>> worth trying, such as BPF related tools, drgn, etc.
>>
>> For non-bcachefs developers, who knows what those statistics mean?
>>
>> You can use BPF or drgn to traverse in advance to get the address of the
>> bcachefs shrinker structure, and then during OOM, find the bcachefs
>> private structure through the shrinker->private_data member, and then
>> dump the bcachefs private data. Is there any problem with this?
>
> No, BPF is not an excuse for improving our OOM/allocation failure
> reports. BPF/tracing are secondary tools; whenever we're logging
> information about a problem we should strive to log enough information
> to debug the issue.
>
> We've got junk in there we don't need: as mentioned before, there's no
> need to be dumping information on _every_ slab, we can pick the ones
> using the most memory and show those.
>
> Similarly for shrinkers, we're not going to be printing all of them -
> the patchset picks the top 10 by objects and prints those. Could
> probably be ~4, there's fewer shrinkers than slabs; also if we can get
> shrinkers to report on memory owned in bytes, that will help too with
> deciding what information is pertinent.
I'm not worried about the shrinker's general data. What I'm worried
about is the shrinker's private data. Except for the corresponding
developers, others don't know the meaning of the private statistical
data, and we have no control over how much private data gets printed or
in what form. This may indeed clutter the OOM log and break automatic
parsing. Any thoughts on that?
>
> That's not a huge amount of information to be dumping, and to make it
> easier to debug something that has historically been a major pain point.
>
> There's a lot more that could be done to make our OOM reports more
> readable and useful to non-mm developers. Unfortunately, any time
> changing the show_mem report the immediate reaction seems to be "but
> that will break my log parsing/change what I'm used to!"...
On Thu, Nov 30, 2023 at 11:42:45AM +0800, Qi Zheng wrote:
> > Similarly for shrinkers, we're not going to be printing all of them -
> > the patchset picks the top 10 by objects and prints those. Could
> > probably be ~4, there's fewer shrinkers than slabs; also if we can get
> > shrinkers to report on memory owned in bytes, that will help too with
> > deciding what information is pertinent.
>
> I'm not worried about the shrinker's general data. What I'm worried
> about is the shrinker's private data. Except for the corresponding
> developers, others don't know the meaning of the private statistical
> data, and we have no control over the printing quantity and form of
> the private data. This may indeed cause OOM log confusion and failure
> to automatically parse. For this, any thoughts?
If a shrinker is responsible for the OOM, then that private state is
exactly what is needed to debug the issue.
I explained this earlier - shrinkers can skip reclaiming objects for a
variety of reasons; in bcachefs's case that could be trylock() failure,
an IO in flight, the node being dirty, and more. Unlike the system inode
shrinker, it's much too expensive to move objects on and off the
shrinker list whenever they're touched. Again, this all comes from real
world experience.
The show_mem report is already full of numbers with zero explanation of
how they're relevant for debugging OOMs; we really need to improve how
that is presented as well.
On Wed 29-11-23 18:11:47, Kent Overstreet wrote:
> On Wed, Nov 29, 2023 at 10:14:54AM +0100, Michal Hocko wrote:
> > On Tue 28-11-23 16:34:35, Roman Gushchin wrote:
> > > On Tue, Nov 28, 2023 at 02:23:36PM +0800, Qi Zheng wrote:
> > [...]
> > > > Now I think adding this method might not be a good idea. If we allow
> > > > shrinkers to report thier own private information, OOM logs may become
> > > > cluttered. Most people only care about some general information when
> > > > troubleshooting OOM problem, but not the private information of a
> > > > shrinker.
> > >
> > > I agree with that.
> > >
> > > It seems that the feature is mostly useful for kernel developers and it's easily
> > > achievable by attaching a bpf program to the oom handler. If it requires a bit
> > > of work on the bpf side, we can do that instead, but probably not. And this
> > > solution can potentially provide way more information in a more flexible way.
> > >
> > > So I'm not convinced it's a good idea to make the generic oom handling code
> > > more complicated and fragile for everybody, as well as making oom reports differ
> > > more between kernel versions and configurations.
> >
> > Completely agreed! From my many years of experience of oom reports
> > analysing from production systems I would conclude the following categories
> > - clear runaways (and/or memory leaks)
> > - userspace consumers - either shmem or anonymous memory
> > predominantly consumes the memory, swap is either depleted
> > or not configured.
> > OOM report is usually useful to pinpoint those as we
> > have required counters available
> > - kernel memory consumers - if we are lucky they are
> > using slab allocator and unreclaimable slab is a huge
> > part of the memory consumption. If this is a page
> > allocator user the oom repport only helps to deduce
> > the fact by looking at how much user + slab + page
> > table etc. form. But identifying the root cause is
> > close to impossible without something like page_owner
> > or a crash dump.
> > - misbehaving memory reclaim
> > - minority of issues and the oom report is usually
> > insufficient to drill down to the root cause. If the
> > problem is reproducible then collecting vmstat data
> > can give a much better clue.
> > - high number of slab reclaimable objects or free swap
> > are good indicators. Shrinkers data could be
> > potentially helpful in the slab case but I really have
> > hard time to remember any such situation.
> > On non-production systems the situation is quite different. I can see
> > how it could be very beneficial to add a very specific debugging data
> > for subsystem/shrinker which is developed and could cause the OOM. For
> > that purpose the proposed scheme is rather inflexible AFAICS.
>
> Considering that you're an MM guy, and that shrinkers are pretty much
> universally used by _filesystem_ people - I'm not sure your experience
> is the most relevant here?
I really do not understand how you have concluded that. In those years
of analysis I was not debugging my _own_ code. I was dealing with
customer reports, and customers can hardly be blamed for triggering any
specific class of OOM reports.
> The general attitude I've been seeing in this thread has been one of
> dismissiveness towards filesystem people. Roman too; back when he was
> working on his shrinker debug feature I reached out to him, explained
> that I was working on my own, and asked about collaborating - got
> crickets in response...
This is completely off and it makes me _really_ wonder whether
discussions like this one are really worth the time. You have been
presented with arguments, yet you seem to be convinced that every
disagreement is against you. Not the first time this is happening. Stop
it please!
As a matter of fact, you are proposing a very specific form of
debugging without showing that it is a generally useful thing to do or
even giving us a couple of examples where it was useful in a production
environment. This is where you should have started, and then we could
have helped out to form an acceptable solution. Throwing out a "this
does what we need, take it or leave it" attitude is usually not the best
way to get your work merged.
> Hmm..
>
> Besides that, I haven't seen anything what-so-ever out of you guys to
> make our lives easier, regarding OOM debugging, nor do you guys even
> seem interested in the needs and perspectives of the filesytem people.
> Roman, your feature didn't help one bit for OOM debuging - didn't even
> come with documentation or hints as to what it's for.
>
> BPF? Please.
>
> Anyways.
>
> Regarding log spam: that's something this patchset already starts to
> address. I don't think we needed to be dumping every single slab in the
> system, for ~2 pages worth of logs; hence this patchset changes that to
> just print the top 10.
Increasing the threshold for slabs to be printed is something I wouldn't
mind at all.
> The same approach is taken with shrinkers: more targeted, less spammy
> output.
>
> So now that that concern has been addressed, perhaps some actual meat:
>
> For one, the patchset adds tracking for when a shrinker was last asked
> to free something, vs. when it was actually freed. So right there, we
> can finally see at a glance when a shrinker has gotten stuck and which
> one.
The primary problem I have with this is how to decide whether to dump
shrinker data and/or which shrinkers to mention. How do you know that it
is the specific shrinker which has contributed to the OOM state?
Printing that data unconditionally will very likely be just additional
ballast in most production situations. Sure, if you are doing filesystem
development and you are tuning your specific shrinker then this might be
really important information to have. But then it is a debugging devel
tool rather than something we want or need to have in a generic oom
report.
All that being said, I am with you on the fact that the oom report in
its current form could see improvements. But when adding more
information please always focus on general usefulness. We have very
rich tracing capabilities which can be used for ad-hoc or very specific
purposes, as they are much more flexible.
--
Michal Hocko
SUSE Labs
On Wed, Nov 29, 2023 at 10:21:49PM -0500, Kent Overstreet wrote:
> On Thu, Nov 30, 2023 at 11:09:42AM +0800, Qi Zheng wrote:
> >
> >
> > On 2023/11/30 07:11, Kent Overstreet wrote:
> > > On Wed, Nov 29, 2023 at 10:14:54AM +0100, Michal Hocko wrote:
> > > > On Tue 28-11-23 16:34:35, Roman Gushchin wrote:
> > > > > On Tue, Nov 28, 2023 at 02:23:36PM +0800, Qi Zheng wrote:
> > > > [...]
> > > > > > Now I think adding this method might not be a good idea. If we allow
> > > > > > shrinkers to report thier own private information, OOM logs may become
> > > > > > cluttered. Most people only care about some general information when
> > > > > > troubleshooting OOM problem, but not the private information of a
> > > > > > shrinker.
> > > > >
> > > > > I agree with that.
> > > > >
> > > > > It seems that the feature is mostly useful for kernel developers and it's easily
> > > > > achievable by attaching a bpf program to the oom handler. If it requires a bit
> > > > > of work on the bpf side, we can do that instead, but probably not. And this
> > > > > solution can potentially provide way more information in a more flexible way.
> > > > >
> > > > > So I'm not convinced it's a good idea to make the generic oom handling code
> > > > > more complicated and fragile for everybody, as well as making oom reports differ
> > > > > more between kernel versions and configurations.
> > > >
> > > > Completely agreed! From my many years of experience of oom reports
> > > > analysing from production systems I would conclude the following categories
> > > > - clear runaways (and/or memory leaks)
> > > > - userspace consumers - either shmem or anonymous memory
> > > > predominantly consumes the memory, swap is either depleted
> > > > or not configured.
> > > > OOM report is usually useful to pinpoint those as we
> > > > have required counters available
> > > > - kernel memory consumers - if we are lucky they are
> > > > using slab allocator and unreclaimable slab is a huge
> > > > part of the memory consumption. If this is a page
> > > > allocator user the oom repport only helps to deduce
> > > > the fact by looking at how much user + slab + page
> > > > table etc. form. But identifying the root cause is
> > > > close to impossible without something like page_owner
> > > > or a crash dump.
> > > > - misbehaving memory reclaim
> > > > - minority of issues and the oom report is usually
> > > > insufficient to drill down to the root cause. If the
> > > > problem is reproducible then collecting vmstat data
> > > > can give a much better clue.
> > > > - high number of slab reclaimable objects or free swap
> > > > are good indicators. Shrinkers data could be
> > > > potentially helpful in the slab case but I really have
> > > > hard time to remember any such situation.
> > > > On non-production systems the situation is quite different. I can see
> > > > how it could be very beneficial to add a very specific debugging data
> > > > for subsystem/shrinker which is developed and could cause the OOM. For
> > > > that purpose the proposed scheme is rather inflexible AFAICS.
> > >
> > > Considering that you're an MM guy, and that shrinkers are pretty much
> > > universally used by _filesystem_ people - I'm not sure your experience
> > > is the most relevant here?
> > >
> > > The general attitude I've been seeing in this thread has been one of
> > > dismissiveness towards filesystem people. Roman too; back when he was
> >
> > Oh, please don't say that, it seems like you are the only one causing
> > the fight. We deeply respect the opinions of file system developers, so
> > I invited Dave to this thread from the beginning. And you didn't CC
> > [email protected] yourself.
> >
> > > working on his shrinker debug feature I reached out to him, explained
> > > that I was working on my own, and asked about collaborating - got
> > > crickets in response...
> > >
> > > Hmm..
> > >
> > > Besides that, I haven't seen anything what-so-ever out of you guys to
> > > make our lives easier, regarding OOM debugging, nor do you guys even
> > > seem interested in the needs and perspectives of the filesytem people.
> > > Roman, your feature didn't help one bit for OOM debuging - didn't even
> > > come with documentation or hints as to what it's for.
> > >
> > > BPF? Please.
> >
> > (Disclaimer, no intention to start a fight, here are some objective
> > views.)
> >
> > Why not? In addition to printk, there are many good debugging tools
> > worth trying, such as BPF related tools, drgn, etc.
> >
> > For non-bcachefs developers, who knows what those statistics mean?
> >
> > You can use BPF or drgn to traverse in advance to get the address of the
> > bcachefs shrinker structure, and then during OOM, find the bcachefs
> > private structure through the shrinker->private_data member, and then
> > dump the bcachefs private data. Is there any problem with this?
>
> No, BPF is not an excuse for improving our OOM/allocation failure
> reports. BPF/tracing are secondary tools; whenever we're logging
> information about a problem we should strive to log enough information
> to debug the issue.
Ok, a simple question then:
why can't you dump /proc/slabinfo after the OOM?
Unlike anon memory, slab memory (fs caches in particular) should not be heavily
affected by killing some userspace task.
Thanks.
On Thu, Nov 30, 2023 at 11:01:23AM -0800, Roman Gushchin wrote:
> Ok, a simple question then:
> why can't you dump /proc/slabinfo after the OOM?
>
> Unlike anon memory, slab memory (fs caches in particular) should not be heavily
> affected by killing some userspace task.
Well, currently the show_mem report dumps slab info if unreclaimable
slab usage is over some threshold (50%?). So it already does what you
describe - sometimes.
One of the patches in this series trims that down to make it more
useful; reporting on only the top 10 slabs, by memory usage, in sorted
order and with human readable units.
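The top-N selection itself is nothing fancy; conceptually it's just an
insertion sort into a small fixed-size array, roughly like this (an
illustrative sketch, not the actual patch):

#include <linux/string.h>

#define TOP_N 10

struct slab_entry {
	const char	*name;
	unsigned long	bytes;
};

/* Keep the TOP_N largest entries seen so far, sorted largest first. */
static void top_n_insert(struct slab_entry *top, int *nr,
			 const char *name, unsigned long bytes)
{
	int i;

	/* Find where this entry belongs, largest first */
	for (i = 0; i < *nr; i++)
		if (bytes > top[i].bytes)
			break;

	if (i >= TOP_N)
		return;

	if (*nr < TOP_N)
		(*nr)++;

	/* Shift smaller entries down (dropping the last one if full) and insert */
	memmove(&top[i + 1], &top[i], (*nr - i - 1) * sizeof(*top));
	top[i].name  = name;
	top[i].bytes = bytes;
}

The report then walks the full list of caches once, feeds each one
through something like the above, and prints only the survivors.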
On Thu, Nov 30, 2023 at 11:01:23AM -0800, Roman Gushchin wrote:
> On Wed, Nov 29, 2023 at 10:21:49PM -0500, Kent Overstreet wrote:
> > On Thu, Nov 30, 2023 at 11:09:42AM +0800, Qi Zheng wrote:
> > > For non-bcachefs developers, who knows what those statistics mean?
For non-mm developers, who knows what those internal mm state
statistics mean?
IOWs, a non-mm developer goes and asks a mm developer to
help them decipher the output to determine what to do next.
So why can't a mm developer go and ask a subsystem developer to tell
them what the shrinker oom-kill output means?
Such a question is a demonstration of an unconscious bias
that prioritises internal mm stuff as far more important than what
anyone else outside core-mm might ever need...
> > > You can use BPF or drgn to traverse in advance to get the address of the
> > > bcachefs shrinker structure, and then during OOM, find the bcachefs
> > > private structure through the shrinker->private_data member, and then
> > > dump the bcachefs private data. Is there any problem with this?
> >
> > No, BPF is not an excuse for improving our OOM/allocation failure
> > reports. BPF/tracing are secondary tools; whenever we're logging
> > information about a problem we should strive to log enough information
> > to debug the issue.
>
> Ok, a simple question then:
> why can't you dump /proc/slabinfo after the OOM?
Taken to its logical conclusion, we arrive at:
OOM-kill doesn't need to output anything at all except for
what it killed because we can dump
/proc/{mem,zone,vmalloc,buddy,slab}info after the OOM....
As it is, even asking such a question shows that you haven't looked
at the OOM kill output for a long time - it already reports the slab
cache usage information for caches that are reclaimable.
That is, if too much accounted slab cache based memory consumption
is detected at OOM-kill, it will call dump_unreclaimable_slab() to
dump all the SLAB_RECLAIM_ACCOUNT caches (i.e. those with shrinkers)
to the console as part of the OOM-kill output.
The problem Kent is trying to address is that this output *isn't
sufficient to debug shrinker based memory reclaim issues*. It hasn't
been for a long time, and so we've all got our own special debug
patches and methods for checking that shrinkers are doing what they
are supposed to. Kent is trying to formalise one of the more useful
general methods for exposing that internal information when OOM
occurs...
Indeed, I can think of several uses for a shrinker->to_text() output
that we simply cannot do right now.
Any shrinker that does garbage collection on something that is not a
pure slab cache (e.g. xfs buffer cache, xfs inode gc subsystem,
graphics memory allocators, binder, etc) has no visibility of the
actual memory being used by the subsystem in the OOM-kill output.
This information isn't in /proc/slabinfo, it's not accounted by a
SLAB_RECLAIM_ACCOUNT cache, and it's not accounted by anything in
the core mm statistics.
e.g. How does anyone other than an XFS expert know that the 500k of
active xfs_buf handles in the slab cache actually pin 15GB of
cached metadata allocated directly from the page allocator, not just
the 150MB of slab cache the handles take up?
Another example is that an inode can pin lots of heap memory (e.g.
for in-memory extent lists) and that may not be freeable until the
inode is reclaimed. So while the slab cache might not be excessively
large, we might have a million inodes with a billion cumulative
extents cached in memory and it is the heap memory consumed by the
cached extents that is consuming the 30GB of "missing" kernel memory
that is causing OOM-kills to occur.
How is a user or developer supposed to know when one of these
situations has occurred given the current lack of memory usage
introspection into subsystems?
These are the sorts of situations that shrinker->to_text() would
allow us to enumerate when it is necessary (i.e. at OOM-kill). At
any other time, it just doesn't matter, but when we're at OOM having
a mechanism to report somewhat accurate subsystem memory consumption
would be very useful indeed.
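As a purely hypothetical sketch (not XFS code, all names invented) of
what such a callback could report for a cache whose slab-allocated
handles pin memory allocated outside slab:

#include <linux/mm.h>
#include <linux/seq_buf.h>
#include <linux/shrinker.h>
#include <linux/string_helpers.h>

/* Made-up example structure - stands in for e.g. a buffer cache. */
struct buf_cache {
	unsigned long	nr_bufs;		/* slab-allocated handles */
	unsigned long	nr_pinned_pages;	/* pages those handles pin */
};

static void buf_cache_shrinker_to_text(struct seq_buf *out, struct shrinker *shrink)
{
	struct buf_cache *bc = shrink->private_data;

	seq_buf_printf(out, "cached handles: %lu\n", bc->nr_bufs);
	seq_buf_puts(out, "pinned memory:  ");
	seq_buf_human_readable_u64(out, (u64)bc->nr_pinned_pages << PAGE_SHIFT,
				   STRING_UNITS_2);
	seq_buf_putc(out, '\n');
}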
> Unlike anon memory, slab memory (fs caches in particular) should not be heavily
> affected by killing some userspace task.
Whether tasks get killed or not is completely irrelevant. The issue
is that not all memory that is reclaimed by shrinkers is either pure
slab cache memory or directly accounted as reclaimable to the mm
subsystem....
-Dave.
--
Dave Chinner
[email protected]
On Thu, Nov 30, 2023 at 09:14:35AM +0100, Michal Hocko wrote:
> On Wed 29-11-23 18:11:47, Kent Overstreet wrote:
> > Considering that you're an MM guy, and that shrinkers are pretty much
> > universally used by _filesystem_ people - I'm not sure your experience
> > is the most relevant here?
>
> I really do not understand where you have concluded that. In those years
> of analysis I was not debugging my _own_ code. I was dealing with
> customer reports and I would not really blame them to specifically
> trigger any class of OOM reports.
I've also spent a considerable amount of time debugging OOM issues, and
a lot of that took a lot longer than it should have due to insufficient
visibility into what the system was doing.
I'm talking about things like tuning journal reclaim/writeback behaviour
(this is a tricky one! shrinkers can't shrink if all items are dirty,
but random update workloads really suffer if we're biasing too much in
favour of memory reclaim, i.e. limiting dirty ratio too much), or
debugging tests in fstests that really like to exhaust memory on just
the inode cache.
If you can take the time to understand what other people are trying to
do and share your own perspective on what you find useful - instead of
just saying "I've spent a lot of time on OOM reports and I haven't need
any of this/this is just for debugging" - we'll be able to have a much
more productive discussion.
Regarding another point you guys have been making - that this is "just
for developers debugging their own code" - that's a terribly dismissive
attitude to take as well.
Debugging doesn't stop when we're done testing the code on our local
machine and push it out to be merged; we're constantly debugging our
own code as it is running in the wild based on sparse bug reports with
at most a dmesg log. That dmesg log needs to, whenever possible, have
all the information we need to debug the issue.
In bcachefs, I have made this principle a _high_ priority; when I have a
bug in front of me, if there's visibility improvements that would make
the issue easier to debug I prioritize that _first_, and then fix the
actual bug. That's been one of the guiding principles that have enabled
me to work efficiently.
Code should tell you _what_ went wrong when something goes wrong,
whenever possible. Not just for ourselves, the individual developers; it
makes our code more maintainable for the people that come after us.
> > For one, the patchset adds tracking for when a shrinker was last asked
> > to free something, vs. when it was actually freed. So right there, we
> > can finally see at a glance when a shrinker has gotten stuck and which
> > one.
>
> The primary problem I have with this is how to decide whether to dump
> shrinker data and/or which shrinkers to mention. How do you know that it
> is the specific shrinker which has contributed to the OOM state?
> Printing that data unconditionally will very likely be just additional
> balast in most production situations. Sure if you are doing a filesystem
> development and you are tuning your specific shrinker then this might be
> a really important information to have. But then it is a debugging devel
> tool rather than something we want or need to have in a generic oom
> report.
Like I've mentioned before, this patchset only reports on the top 10
shrinkers, by number of objects. If we can plumb through reporting on
memory usage in _bytes_, that would help even more with deciding what to
report on.
> All that being said, I am with you on the fact that the oom report in
> its current form could see improvements.
I'm glad we're finally in agreement on something!
If you want to share your own ideas on what could be improved and what
you find useful, maybe we could find some more common ground.
On Thu 30-11-23 20:47:45, Kent Overstreet wrote:
> On Thu, Nov 30, 2023 at 09:14:35AM +0100, Michal Hocko wrote:
[...]
> > All that being said, I am with you on the fact that the oom report in
> > its current form could see improvements.
>
> I'm glad we're finally in agreement on something!
>
> If you want to share your own ideas on what could be improved and what
> you find useful, maybe we could find some more common ground.
One thing that I would consider an improvement is to have a way for
drivers with excessive memory consumption, or those which are struggling
to reclaim, to subscribe themselves to dump their state.
Maybe your proposal can be extended that way but the crucial point is to
not dump all sorts of random shrinkers' state and end up with unwieldy
reports. If, on the other hand, any particular shrinker struggles to
reclaim memory and is sitting on a lot of memory, it could flag itself
to be included in the dump.
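Purely to illustrate the idea - none of the fields or helpers below
exist, they are made up here - a shrinker could opt itself in from its
scan callback when it is clearly struggling:

#include <linux/atomic.h>
#include <linux/shrinker.h>

/* Made-up cache structure and threshold, for illustration only. */
#define REPORT_THRESHOLD	(1UL << 20)

struct my_cache {
	atomic_long_t	nr_objects;
	bool		report_on_oom;	/* read by the OOM report path */
};

unsigned long my_cache_do_reclaim(struct my_cache *c, unsigned long nr_to_scan);

static unsigned long my_cache_scan(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	struct my_cache *c = shrink->private_data;
	unsigned long freed = my_cache_do_reclaim(c, sc->nr_to_scan);

	/*
	 * Asked to reclaim, freed nothing, and still sitting on a lot of
	 * memory: flag ourselves for inclusion in the next OOM dump.
	 */
	WRITE_ONCE(c->report_on_oom,
		   !freed && atomic_long_read(&c->nr_objects) > REPORT_THRESHOLD);

	return freed;
}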
--
Michal Hocko
SUSE Labs
On Fri, Dec 01, 2023 at 12:18:44PM +1100, Dave Chinner wrote:
> On Thu, Nov 30, 2023 at 11:01:23AM -0800, Roman Gushchin wrote:
> > On Wed, Nov 29, 2023 at 10:21:49PM -0500, Kent Overstreet wrote:
> > > On Thu, Nov 30, 2023 at 11:09:42AM +0800, Qi Zheng wrote:
> > > > For non-bcachefs developers, who knows what those statistics mean?
>
> > Ok, a simple question then:
> > why can't you dump /proc/slabinfo after the OOM?
>
> Taken to it's logical conclusion, we arrive at:
>
> OOM-kill doesn't need to output anything at all except for
> what it killed because we can dump
> /proc/{mem,zone,vmalloc,buddy,slab}info after the OOM....
>
> As it is, even asking such a question shows that you haven't looked
> at the OOM kill output for a long time - it already reports the slab
> cache usage information for caches that are reclaimable.
>
> That is, if too much accounted slab cache based memory consumption
> is detected at OOM-kill, it will calldump_unreclaimable_slab() to
> dump all the SLAB_RECLAIM_ACCOUNT caches (i.e. those with shrinkers)
> to the console as part of the OOM-kill output.
You are right, I missed that, partially because most of the OOMs I had
to deal with recently were memcg OOMs.
This changes my perspective on Kent's patches: if we already dump this
information, it might not be a bad idea to do it more nicely. So I take
my words back here.
>
> The problem Kent is trying to address is that this output *isn't
> sufficient to debug shrinker based memory reclaim issues*. It hasn't
> been for a long time, and so we've all got our own special debug
> patches and methods for checking that shrinkers are doing what they
> are supposed to. Kent is trying to formalise one of the more useful
> general methods for exposing that internal information when OOM
> occurs...
>
> Indeed, I can think of several uses for a shrinker->to_text() output
> that we simply cannot do right now.
>
> Any shrinker that does garbage collection on something that is not a
> pure slab cache (e.g. xfs buffer cache, xfs inode gc subsystem,
> graphics memory allocators, binder, etc) has no visibility of the
> actuall memory being used by the subsystem in the OOM-kill output.
> This information isn't in /proc/slabinfo, it's not accounted by a
> SLAB_RECLAIM_ACCOUNT cache, and it's not accounted by anything in
> the core mm statistics.
>
> e.g. How does anyone other than a XFS expert know that the 500k of
> active xfs_buf handles in the slab cache actually pins 15GB of
> cached metadata allocated directly from the page allocator, not just
> the 150MB of slab cache the handles take up?
>
> Another example is that an inode can pin lots of heap memory (e.g.
> for in-memory extent lists) and that may not be freeable until the
> inode is reclaimed. So while the slab cache might not be excesively
> large, we might have an a million inodes with a billion cumulative
> extents cached in memory and it is the heap memory consumed by the
> cached extents that is consuming the 30GB of "missing" kernel memory
> that is causing OOM-kills to occur.
>
> How is a user or developer supposed to know when one of these
> situations has occurred given the current lack of memory usage
> introspection into subsystems?
What would be the proper solution to this problem from your point of view?
What functionality/API can mm provide to make the lives of fs developers
better here?
>
> These are the sorts of situations that shrinker->to_text() would
> allow us to enumerate when it is necessary (i.e. at OOM-kill). At
> any other time, it just doesn't matter, but when we're at OOM having
> a mechanism to report somewhat accurate subsystem memory consumption
> would be very useful indeed.
>
> > Unlike anon memory, slab memory (fs caches in particular) should not be heavily
> > affected by killing some userspace task.
>
> Whether tasks get killed or not is completely irrelevant. The issue
> is that not all memory that is reclaimed by shrinkers is either pure
> slab cache memory or directly accounted as reclaimable to the mm
> subsystem....
My problem with the current OOM reporting infrastructure (and it's a bit
offtopic here) is that it's good for manually looking into these reports,
but not particularly great for automatic collection and analysis at
scale. So this is where I was coming from.
Thanks!
On Fri, Dec 01, 2023 at 11:04:23AM +0100, Michal Hocko wrote:
> On Thu 30-11-23 20:47:45, Kent Overstreet wrote:
> > On Thu, Nov 30, 2023 at 09:14:35AM +0100, Michal Hocko wrote:
> [...]
> > > All that being said, I am with you on the fact that the oom report in
> > > its current form could see improvements.
> >
> > I'm glad we're finally in agreement on something!
> >
> > If you want to share your own ideas on what could be improved and what
> > you find useful, maybe we could find some more common ground.
>
> One thing that I would consider an improvement is to have a way to
> subscribe drivers with excessive memory consumption or those which are
> struggling to dump their state.
Remember the memory allocation profiling patchset? The one where you
kept complaining about "maintenance overhead"?
We can plug that into the show_mem report too, and list the top 10
allocations by file and line number.
> Maybe your proposal can be extended that way but the crucial point is to
> not dump all sorts of random shrinkers' state and end up with unwieldy
> reports. If, on the other hand, any particular shrinker struggles to
> reclaim memory and it is sitting on a lot of memory it could be able to
> flag itself to be involved in the dump.
Great: since, as was mentioned in the original commit message, it's not
"all sorts of random shrinkers" but the top 10 by objects reported, what
I've got here should make you happy.
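For reference, a rough sketch of what a "top 10 by object count, sorted" selection could look like; the struct, the helper names and the way the counts are gathered are illustrative assumptions, not the series' actual code.

#include <linux/seq_buf.h>
#include <linux/shrinker.h>
#include <linux/sort.h>

struct shrinker_count {			/* hypothetical: one entry per registered shrinker */
	struct shrinker	*shrinker;
	unsigned long	 nr;		/* objects reported by ->count_objects() */
};

static int cmp_shrinker_count(const void *l, const void *r)
{
	const struct shrinker_count *a = l, *b = r;

	/* sort descending: biggest object counts first */
	if (a->nr != b->nr)
		return a->nr < b->nr ? 1 : -1;
	return 0;
}

static void report_top_shrinkers(struct shrinker_count *counts, unsigned int nr,
				 struct seq_buf *out)
{
	unsigned int i;

	sort(counts, nr, sizeof(*counts), cmp_shrinker_count, NULL);

	for (i = 0; i < nr && i < 10; i++)
		seq_buf_printf(out, "%-32s %lu objects\n",
			       counts[i].shrinker->name, counts[i].nr);
}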
On Fri, Dec 01, 2023 at 12:01:33PM -0800, Roman Gushchin wrote:
> My problem with the current OOM reporting infrastructure (and it's a bit
> off-topic here) - it's good for manually looking into these reports, but not
> particularly great for automatic collection and analysis at scale.
> So this is where I was coming from.
Well, toml isn't the worst thing ever, as far as
machine- and human-readable formats go.
For those unfamiliar, it's related to json: it's less general than json,
but in exchange it's much easier on the eyes.
On Fri 01-12-23 16:25:06, Kent Overstreet wrote:
> On Fri, Dec 01, 2023 at 11:04:23AM +0100, Michal Hocko wrote:
> > On Thu 30-11-23 20:47:45, Kent Overstreet wrote:
> > > On Thu, Nov 30, 2023 at 09:14:35AM +0100, Michal Hocko wrote:
> > [...]
> > > > All that being said, I am with you on the fact that the oom report in
> > > > its current form could see improvements.
> > >
> > > I'm glad we're finally in agreement on something!
> > >
> > > If you want to share your own ideas on what could be improved and what
> > > you find useful, maybe we could find some more common ground.
> >
> > One thing that I would consider an improvement is to have a way to
> > subscribe drivers with excessive memory consumption or those which are
> > struggling to dump their state.
>
> Remember the memory allocation profiling patchset? The one where you
> kept complaining about "maintenance overhead"?
Yes, I still maintain my opinion on that approach. I have never
questioned the usefulness of the information.
> We can plug that into the show_mem report too, and list the top 10
> allocations by file and line number.
>
> > Maybe your proposal can be extended that way but the crucial point is to
> > not dump all sorts of random shrinkers' state and end up with unwieldy
> > reports. If, on the other hand, any particular shrinker struggles to
> > reclaim memory and it is sitting on a lot of memory it could be able to
> > flag itself to be involved in the dump.
>
> Great: since, as was mentioned in the original commit message, it's not
> "all sorts of random shrinkers" but the top 10 by objects reported, what
> I've got here should make you happy.
Can we do better and make that a shrinker decision rather than an
arbitrary top N selection? The thing is that shrinkers might not even
matter in many cases, so their output would be just ballast. The number
of objects is not a universally great choice. As Dave mentioned, metadata
might be pinning other objects.
That being said, if you want to give more debuggability power to
shrinkers then it makes more sense to allow them to opt in to the oom
report rather than control which of them to involve from the oom
reporting code, which doesn't have enough context on its own.
--
Michal Hocko
SUSE Labs
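One possible shape of the opt-in being suggested here is a shrinker flag that the OOM report checks instead of doing an arbitrary top-N selection; the flag bit and helper below are hypothetical, purely to illustrate the idea.

#include <linux/bits.h>
#include <linux/shrinker.h>

#define SHRINKER_OOM_REPORT	BIT(8)	/* hypothetical flag, not an existing bit */

static bool shrinker_wants_oom_report(struct shrinker *shrinker)
{
	/*
	 * A shrinker would set the flag itself, e.g. when it is sitting on
	 * a lot of memory it is struggling to reclaim, to ask for inclusion
	 * in the OOM-time dump.
	 */
	return shrinker->flags & SHRINKER_OOM_REPORT;
}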
On Mon, Dec 04, 2023 at 11:33:12AM +0100, Michal Hocko wrote:
> On Fri 01-12-23 16:25:06, Kent Overstreet wrote:
> > On Fri, Dec 01, 2023 at 11:04:23AM +0100, Michal Hocko wrote:
> > > On Thu 30-11-23 20:47:45, Kent Overstreet wrote:
> > > > On Thu, Nov 30, 2023 at 09:14:35AM +0100, Michal Hocko wrote:
> > > [...]
> > > > > All that being said, I am with you on the fact that the oom report in
> > > > > its current form could see improvements.
> > > >
> > > > I'm glad we're finally in agreement on something!
> > > >
> > > > If you want to share your own ideas on what could be improved and what
> > > > you find useful, maybe we could find some more common ground.
> > >
> > > One thing that I would consider an improvement is to have a way to
> > > subscribe drivers with excessive memory consumption or those which are
> > > struggling to dump their state.
> >
> > Remember the memory allocation profiling patchset? The one where you
> > kept complaining about "maintenance overhead"?
>
> Yes, I still maintain my opinion on that approach. I have never
> questioned the usefulness of the information.
>
> > We can plug that into the show_mem report too, and list the top 10
> > allocations by file and line number.
> >
> > > Maybe your proposal can be extended that way but the crucial point is to
> > > not dump all sorts of random shrinkers' state and end up with unwieldy
> > > reports. If, on the other hand, any particular shrinker struggles to
> > > reclaim memory and it is sitting on a lot of memory it could be able to
> > > flag itself to be involved in the dump.
> >
> > Great: since, as was mentioned in the original commit message, it's not
> > "all sorts of random shrinkers" but the top 10 by objects reported, what
> > I've got here should make you happy.
>
> Can we do better and make that a shrinker decision rather than an
> arbitrary top N selection? The thing is that shrinkers might not even
> matter in many cases, so their output would be just ballast. The number
> of objects is not a universally great choice. As Dave mentioned, metadata
> might be pinning other objects.
>
> That being said, if you want to give more debuggability power to
> shrinkers then it makes more sense to allow them to opt in to the oom
> report rather than control which of them to involve from the oom
> reporting code, which doesn't have enough context on its own.
If you've got an idea for a refinement, please submit your own patch and
I'll look at incorporating it into the series.
On Fri, Dec 01, 2023 at 12:01:33PM -0800, Roman Gushchin wrote:
> On Fri, Dec 01, 2023 at 12:18:44PM +1100, Dave Chinner wrote:
> > On Thu, Nov 30, 2023 at 11:01:23AM -0800, Roman Gushchin wrote:
> > > On Wed, Nov 29, 2023 at 10:21:49PM -0500, Kent Overstreet wrote:
> > > > On Thu, Nov 30, 2023 at 11:09:42AM +0800, Qi Zheng wrote:
> > > > > For non-bcachefs developers, who knows what those statistics mean?
> >
> > > Ok, a simple question then:
> > > why can't you dump /proc/slabinfo after the OOM?
> >
> > Taken to its logical conclusion, we arrive at:
> >
> > OOM-kill doesn't need to output anything at all except for
> > what it killed because we can dump
> > /proc/{mem,zone,vmalloc,buddy,slab}info after the OOM....
> >
> > As it is, even asking such a question shows that you haven't looked
> > at the OOM kill output for a long time - it already reports the slab
> > cache usage information for caches that are reclaimable.
> >
> > That is, if too much accounted slab cache based memory consumption
> > is detected at OOM-kill, it will call dump_unreclaimable_slab() to
> > dump all the SLAB_RECLAIM_ACCOUNT caches (i.e. those with shrinkers)
> > to the console as part of the OOM-kill output.
>
> You are right, I missed that, partially because most of the OOMs I had to deal
> with recently were memcg OOMs.
>
> This changes my perspective on Kent's patches: if we dump this information
> already, it might not be a bad idea to do it nicer. So I take my words back
> here.
>
> >
> > The problem Kent is trying to address is that this output *isn't
> > sufficient to debug shrinker based memory reclaim issues*. It hasn't
> > been for a long time, and so we've all got our own special debug
> > patches and methods for checking that shrinkers are doing what they
> > are supposed to. Kent is trying to formalise one of the more useful
> > general methods for exposing that internal information when OOM
> > occurs...
> >
> > Indeed, I can think of several uses for a shrinker->to_text() output
> > that we simply cannot do right now.
> >
> > Any shrinker that does garbage collection on something that is not a
> > pure slab cache (e.g. xfs buffer cache, xfs inode gc subsystem,
> > graphics memory allocators, binder, etc) has no visibility of the
> > actual memory being used by the subsystem in the OOM-kill output.
> > This information isn't in /proc/slabinfo, it's not accounted by a
> > SLAB_RECLAIM_ACCOUNT cache, and it's not accounted by anything in
> > the core mm statistics.
> >
> > e.g. How does anyone other than an XFS expert know that the 500k of
> > active xfs_buf handles in the slab cache actually pins 15GB of
> > cached metadata allocated directly from the page allocator, not just
> > the 150MB of slab cache the handles take up?
> >
> > Another example is that an inode can pin lots of heap memory (e.g.
> > for in-memory extent lists) and that may not be freeable until the
> > inode is reclaimed. So while the slab cache might not be excessively
> > large, we might have a million inodes with a billion cumulative
> > extents cached in memory and it is the heap memory consumed by the
> > cached extents that is consuming the 30GB of "missing" kernel memory
> > that is causing OOM-kills to occur.
> >
> > How is a user or developer supposed to know when one of these
> > situations has occurred given the current lack of memory usage
> > introspection into subsystems?
>
> What would be the proper solution to this problem from your point of view?
> What functionality/API can mm provide to make the life of fs developers
> better here?
What can we do better?
The first thing we can do better that comes to mind is to merge
Kent's patches that allow the shrinker owner to output debug
information when requested by the infrastructure.
Then we - the shrinker implementers - have some control of our own
destiny. We can add whatever we need to solve shrinker and OOM
problems related to our shrinkers not doing the right thing.
But without that callout from the infrastructure and the
infrastructure to drive it at appropriate times, we will make zero
progress improving the situation.
Yes, the code may not be perfect and, yes, it may not be useful to
mm developers, but for the people who have to debug shrinker-related
problems in production systems we need all the help we can get. We
certainly don't care if it isn't perfect; just having something we
can partially tailor to our individual needs is far, far better
than the current situation of nothing at all...
-Dave.
--
Dave Chinner
[email protected]
On Wed, Dec 06, 2023 at 07:16:04PM +1100, Dave Chinner wrote:
> On Fri, Dec 01, 2023 at 12:01:33PM -0800, Roman Gushchin wrote:
> > What would be the proper solution to this problem from your point of view?
> > What functionality/API can mm provide to make the life of fs developers
> > better here?
>
> What can we do better?
>
> The first thing we can do better that comes to mind is to merge
> Kent's patches that allow the shrinker owner to output debug
> information when requested by the infrastructure.
>
> Then we - the shrinker implementers - have some control of our own
> destiny. We can add whatever we need to solve shrinker and OOM
> problems related to our shrinkers not doing the right thing.
>
> But without that callout from the infrastructure and the
> infrastructure to drive it at appropriate times, we will make zero
> progress improving the situation.
>
> Yes, the code may not be perfect and, yes, it may not be useful to
> mm developers, but for the people who have to debug shrinker-related
> problems in production systems we need all the help we can get. We
> certainly don't care if it isn't perfect; just having something we
> can partially tailor to our individual needs is far, far better
> than the current situation of nothing at all...
Of course if mm people don't want it I've got better things to do than
fight uphill battles just to get some reviewed-by tags. Real high
quality review feedback in this thread.
On Wed, Dec 06, 2023 at 02:13:49PM -0500, Kent Overstreet wrote:
> On Wed, Dec 06, 2023 at 07:16:04PM +1100, Dave Chinner wrote:
> > On Fri, Dec 01, 2023 at 12:01:33PM -0800, Roman Gushchin wrote:
> > > What would be the proper solution to this problem from your point of view?
> > > What functionality/API can mm provide to make the life of fs developers
> > > better here?
> >
> > What can we do better?
> >
> > The first thing we can do better that comes to mind is to merge
> > Kent's patches that allow the shrinker owner to output debug
> > information when requested by the infrastructure.
> >
> > Then we - the shrinker implementers - have some control of our own
> > destiny. We can add whatever we need to solve shrinker and OOM
> > problems related to our shrinkers not doing the right thing.
> >
> > But without that callout from the infrastructure and the
> > infrastructure to drive it at appropriate times, we will make zero
> > progress improving the situation.
> >
> > Yes, the code may not be perfect and, yes, it may not be useful to
> > mm developers, but for the people who have to debug shrinker-related
> > problems in production systems we need all the help we can get. We
> > certainly don't care if it isn't perfect; just having something we
> > can partially tailor to our individual needs is far, far better
> > than the current situation of nothing at all...
>
> Of course if mm people don't want it I've got better things to do than
> fight uphill battles just to get some reviewed-by tags. Real high
> quality review feedback in this thread.
(ignoring an attack towards all mm people, sigh)
Kent, I think extending the shrinker debugfs interface is useful.
And in this context there is no need to limit the output to 10 items.
Also most of the disagreements will vanish (sorry if I missed something,
but it looks like all the concerns are related to the oom part).
Will it work for you?
If yes, would you be able to drop the oom part (at least for now)
and resend the patchset?
Thanks!
On Fri, Dec 08, 2023 at 05:44:16PM -0800, Roman Gushchin wrote:
> On Wed, Dec 06, 2023 at 02:13:49PM -0500, Kent Overstreet wrote:
> > On Wed, Dec 06, 2023 at 07:16:04PM +1100, Dave Chinner wrote:
> > > On Fri, Dec 01, 2023 at 12:01:33PM -0800, Roman Gushchin wrote:
> > > > What would be the proper solution to this problem from your point of view?
> > > > What functionality/API can mm provide to make the life of fs developers
> > > > better here?
> > >
> > > What can we do better?
> > >
> > > The first thing we can do better that comes to mind is to merge
> > > Kent's patches that allow the shrinker owner to output debug
> > > information when requested by the infrastructure.
> > >
> > > Then we - the shrinker implementers - have some control of our own
> > > destiny. We can add whatever we need to solve shrinker and OOM
> > > problems related to our shrinkers not doing the right thing.
> > >
> > > But without that callout from the infrastructure and the
> > > infrastructure to drive it at appropriate times, we will make zero
> > > progress improving the situation.
> > >
> > > Yes, the code may not be perfect and, yes, it may not be useful to
> > > mm developers, but for the people who have to debug shrinker-related
> > > problems in production systems we need all the help we can get. We
> > > certainly don't care if it isn't perfect; just having something we
> > > can partially tailor to our individual needs is far, far better
> > > than the current situation of nothing at all...
> >
> > Of course if mm people don't want it I've got better things to do than
> > fight uphill battles just to get some reviewed-by tags. Real high
> > quality review feedback in this thread.
>
> (ignoring an attack towards all mm people, sigh)
>
> Kent, I think extending the shrinker debugfs interface is useful.
> And in this context there is no need to limit the output to 10 items.
> Also most of the disagreements will vanish (sorry if I missed something,
> but it looks like all the concerns are related to the oom part).
> Will it work for you?
>
> If yes, would you be able to drop the oom part (at least for now)
> and resend the patchset?
No: OOM debugging is the entire point of the patchset.