This series tries to improve visibility of writeback. Patch 1 makes
/sys/kernel/debug/bdi/xxx/stats show writeback info of the whole bdi
instead of only writeback info in the root cgroup. Patch 2 adds a new
debug file /sys/kernel/debug/bdi/xxx/wb_stats to show per-wb writeback
info. Patch 4 adds wb_monitor.py to monitor basic writeback info
of a running system; more info could be added on demand. The remaining
patches are some random cleanups. More details can be found in the
respective patches. Thanks!
Following domain hierarchy is tested:
global domain (320G)
/ \
cgroup domain1(10G) cgroup domain2(10G)
| |
bdi wb1 wb2
/* all writeback info of bdi is successfully collected */
# cat /sys/kernel/debug/bdi/252:16/stats:
BdiWriteback: 448 kB
BdiReclaimable: 1303904 kB
BdiDirtyThresh: 189914124 kB
DirtyThresh: 195337564 kB
BackgroundThresh: 32516508 kB
BdiDirtied: 3591392 kB
BdiWritten: 2287488 kB
BdiWriteBandwidth: 322248 kBps
b_dirty: 0
b_io: 0
b_more_io: 2
b_dirty_time: 0
bdi_list: 1
state: 1
/* per wb writeback info is collected */
# cat /sys/kernel/debug/bdi/252:16/wb_stats:
cat wb_stats
WbCgIno: 1
WbWriteback: 0 kB
WbReclaimable: 0 kB
WbDirtyThresh: 0 kB
WbDirtied: 0 kB
WbWritten: 0 kB
WbWriteBandwidth: 102400 kBps
b_dirty: 0
b_io: 0
b_more_io: 0
b_dirty_time: 0
state: 1
WbCgIno: 4284
WbWriteback: 448 kB
WbReclaimable: 818944 kB
WbDirtyThresh: 3096524 kB
WbDirtied: 2266880 kB
WbWritten: 1447936 kB
WbWriteBandwidth: 214036 kBps
b_dirty: 0
b_io: 0
b_more_io: 1
b_dirty_time: 0
state: 5
WbCgIno: 4325
WbWriteback: 224 kB
WbReclaimable: 819392 kB
WbDirtyThresh: 2920088 kB
WbDirtied: 2551808 kB
WbWritten: 1732416 kB
WbWriteBandwidth: 201832 kBps
b_dirty: 0
b_io: 0
b_more_io: 1
b_dirty_time: 0
state: 5
/* monitor writeback info */
# ./wb_monitor.py 252:16 -c
writeback reclaimable dirtied written avg_bw
252:16_1 0 0 0 0 102400
252:16_4284 672 820064 9230368 8410304 685612
252:16_4325 896 819840 10491264 9671648 652348
252:16 1568 1639904 19721632 18081952 1440360
writeback reclaimable dirtied written avg_bw
252:16_1 0 0 0 0 102400
252:16_4284 672 820064 9230368 8410304 685612
252:16_4325 896 819840 10491264 9671648 652348
252:16 1568 1639904 19721632 18081952 1440360
..
Kemeng Shi (6):
writeback: collect stats of all wb of bdi in bdi_debug_stats_show
writeback: support retrieving per group debug writeback stats of bdi
workqueue: remove unnecessary import and function in wq_monitor.py
writeback: add wb_monitor.py script to monitor writeback info on bdi
writeback: rename nr_reclaimable to nr_dirty in balance_dirty_pages
writeback: remove unneeded GDTC_INIT_NO_WB
include/linux/writeback.h | 1 +
mm/backing-dev.c | 159 ++++++++++++++++++++++++++-----
mm/page-writeback.c | 32 +++++--
tools/workqueue/wq_monitor.py | 9 +-
tools/writeback/wb_monitor.py | 172 ++++++++++++++++++++++++++++++++++
5 files changed, 334 insertions(+), 39 deletions(-)
create mode 100644 tools/writeback/wb_monitor.py
--
2.30.0
Add /sys/kernel/debug/bdi/xxx/wb_stats to show per group writeback stats
of bdi.
Signed-off-by: Kemeng Shi <[email protected]>
---
include/linux/writeback.h | 1 +
mm/backing-dev.c | 66 +++++++++++++++++++++++++++++++++++++++
mm/page-writeback.c | 19 +++++++++++
3 files changed, 86 insertions(+)
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 9845cb62e40b..bb1ce1123b52 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -355,6 +355,7 @@ int dirtytime_interval_handler(struct ctl_table *table, int write,
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
+unsigned long wb_calc_cg_thresh(struct bdi_writeback *wb);
void wb_update_bandwidth(struct bdi_writeback *wb);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 788702b6c5dd..bfc4079dc7fe 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -95,12 +95,77 @@ static void bdi_collect_stats(struct backing_dev_info *bdi,
collect_wb_stats(stats, wb);
mutex_unlock(&bdi->cgwb_release_mutex);
}
+static void wb_stats_show(struct seq_file *m, struct bdi_writeback *wb,
+ struct wb_stats *stats)
+{
+
+ seq_printf(m,
+ "WbCgIno: %10lu\n"
+ "WbWriteback: %10lu kB\n"
+ "WbReclaimable: %10lu kB\n"
+ "WbDirtyThresh: %10lu kB\n"
+ "WbDirtied: %10lu kB\n"
+ "WbWritten: %10lu kB\n"
+ "WbWriteBandwidth: %10lu kBps\n"
+ "b_dirty: %10lu\n"
+ "b_io: %10lu\n"
+ "b_more_io: %10lu\n"
+ "b_dirty_time: %10lu\n"
+ "state: %10lx\n",
+ cgroup_ino(wb->memcg_css->cgroup),
+ K(stats->nr_writeback),
+ K(stats->nr_reclaimable),
+ K(stats->wb_thresh),
+ K(stats->nr_dirtied),
+ K(stats->nr_written),
+ K(wb->avg_write_bandwidth),
+ stats->nr_dirty,
+ stats->nr_io,
+ stats->nr_more_io,
+ stats->nr_dirty_time,
+ wb->state);
+}
+
+static int cgwb_debug_stats_show(struct seq_file *m, void *v)
+{
+ struct backing_dev_info *bdi = m->private;
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
+ struct bdi_writeback *wb;
+ struct wb_stats stats;
+
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+
+ mutex_lock(&bdi->cgwb_release_mutex);
+ list_for_each_entry(wb, &bdi->wb_list, bdi_node) {
+ memset(&stats, 0, sizeof(stats));
+ stats.dirty_thresh = dirty_thresh;
+ collect_wb_stats(&stats, wb);
+
+ if (mem_cgroup_wb_domain(wb) != NULL)
+ stats.wb_thresh = min(stats.wb_thresh, wb_calc_cg_thresh(wb));
+
+ wb_stats_show(m, wb, &stats);
+ }
+ mutex_unlock(&bdi->cgwb_release_mutex);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats);
+
+static void cgwb_debug_register(struct backing_dev_info *bdi)
+{
+ debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi,
+ &cgwb_debug_stats_fops);
+}
#else
static void bdi_collect_stats(struct backing_dev_info *bdi,
struct wb_stats *stats)
{
collect_wb_stats(stats, &bdi->wb);
}
+
+static inline void cgwb_debug_register(struct backing_dev_info *bdi) { }
#endif
static int bdi_debug_stats_show(struct seq_file *m, void *v)
@@ -158,6 +223,7 @@ static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
&bdi_debug_stats_fops);
+ cgwb_debug_register(bdi);
}
static void bdi_debug_unregister(struct backing_dev_info *bdi)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0e20467367fe..ba1b6b5ae5d6 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -893,6 +893,25 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
return __wb_calc_thresh(&gdtc, thresh);
}
+unsigned long wb_calc_cg_thresh(struct bdi_writeback *wb)
+{
+ struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
+ struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) };
+ unsigned long filepages, headroom, writeback;
+
+ gdtc.avail = global_dirtyable_memory();
+ gdtc.dirty = global_node_page_state(NR_FILE_DIRTY) +
+ global_node_page_state(NR_WRITEBACK);
+
+ mem_cgroup_wb_stats(wb, &filepages, &headroom,
+ &mdtc.dirty, &writeback);
+ mdtc.dirty += writeback;
+ mdtc_calc_avail(&mdtc, filepages, headroom);
+ domain_dirty_limits(&mdtc);
+
+ return __wb_calc_thresh(&mdtc, mdtc.thresh);
+}
+
/*
* setpoint - dirty 3
* f(dirty) := 1.0 + (----------------)
--
2.30.0
Add a wb_monitor.py script to monitor writeback information on a backing
dev, which makes it easier and more convenient to observe the writeback
behavior of a running system.
This script is based on a copy of wq_monitor.py.
Signed-off-by: Kemeng Shi <[email protected]>
Suggested-by: Tejun Heo <[email protected]>
---
tools/writeback/wb_monitor.py | 172 ++++++++++++++++++++++++++++++++++
1 file changed, 172 insertions(+)
create mode 100644 tools/writeback/wb_monitor.py
diff --git a/tools/writeback/wb_monitor.py b/tools/writeback/wb_monitor.py
new file mode 100644
index 000000000000..5e3591f1f9a9
--- /dev/null
+++ b/tools/writeback/wb_monitor.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env drgn
+#
+# Copyright (C) 2024 Kemeng Shi <[email protected]>
+# Copyright (C) 2024 Huawei Inc
+
+desc = """
+This is a drgn script based on wq_monitor.py to monitor writeback info on
+backing dev. For more info on drgn, visit https://github.com/osandov/drgn.
+
+ writeback(kB) Amount of dirty pages are currently being written back to
+ disk.
+
+ reclaimable(kB) Amount of pages are currently reclaimable.
+
+ dirtied(kB) Amount of pages have been dirtied.
+
+ wrttien(kB) Amount of dirty pages have been written back to disk.
+
+ avg_wb(kBps) Smoothly estimated write bandwidth of writing dirty pages
+ back to disk.
+"""
+
+import signal
+import re
+import time
+import json
+
+import drgn
+from drgn.helpers.linux.list import list_for_each_entry
+
+import argparse
+parser = argparse.ArgumentParser(description=desc,
+ formatter_class=argparse.RawTextHelpFormatter)
+parser.add_argument('bdi', metavar='REGEX', nargs='*',
+ help='Target backing device name patterns (all if empty)')
+parser.add_argument('-i', '--interval', metavar='SECS', type=float, default=1,
+ help='Monitoring interval (0 to print once and exit)')
+parser.add_argument('-j', '--json', action='store_true',
+ help='Output in json')
+parser.add_argument('-c', '--cgroup', action='store_true',
+ help='show writeback of bdi in cgroup')
+args = parser.parse_args()
+
+bdi_list = prog['bdi_list']
+
+WB_RECLAIMABLE = prog['WB_RECLAIMABLE']
+WB_WRITEBACK = prog['WB_WRITEBACK']
+WB_DIRTIED = prog['WB_DIRTIED']
+WB_WRITTEN = prog['WB_WRITTEN']
+NR_WB_STAT_ITEMS = prog['NR_WB_STAT_ITEMS']
+
+PAGE_SHIFT = prog['PAGE_SHIFT']
+
+def K(x):
+ return x << (PAGE_SHIFT - 10)
+
+class Stats:
+ def dict(self, now):
+ return { 'timestamp' : now,
+ 'name' : self.name,
+ 'writeback' : self.stats[WB_WRITEBACK],
+ 'reclaimable' : self.stats[WB_RECLAIMABLE],
+ 'dirtied' : self.stats[WB_DIRTIED],
+ 'written' : self.stats[WB_WRITTEN],
+ 'avg_wb' : self.avg_bw, }
+
+ def table_header_str():
+ return f'{"":>16} {"writeback":>10} {"reclaimable":>12} ' \
+ f'{"dirtied":>9} {"written":>9} {"avg_bw":>9}'
+
+ def table_row_str(self):
+ out = f'{self.name[-16:]:16} ' \
+ f'{self.stats[WB_WRITEBACK]:10} ' \
+ f'{self.stats[WB_RECLAIMABLE]:12} ' \
+ f'{self.stats[WB_DIRTIED]:9} ' \
+ f'{self.stats[WB_WRITTEN]:9} ' \
+ f'{self.avg_bw:9} '
+ return out
+
+ def show_header():
+ if Stats.table_fmt:
+ print()
+ print(Stats.table_header_str())
+
+ def show_stats(self):
+ if Stats.table_fmt:
+ print(self.table_row_str())
+ else:
+ print(self.dict(Stats.now))
+
+class WbStats(Stats):
+ def __init__(self, wb):
+ bdi_name = wb.bdi.dev_name.string_().decode()
+ # avoid to use bdi.wb.memcg_css which is only defined when
+ # CONFIG_CGROUP_WRITEBACK is enabled
+ if wb == wb.bdi.wb.address_of_():
+ ino = "1"
+ else:
+ ino = str(wb.memcg_css.cgroup.kn.id.value_())
+ self.name = bdi_name + '_' + ino
+
+ self.stats = [0] * NR_WB_STAT_ITEMS
+ for i in range(NR_WB_STAT_ITEMS):
+ if wb.stat[i].count >= 0:
+ self.stats[i] = int(K(wb.stat[i].count))
+ else:
+ self.stats[i] = 0
+
+ self.avg_bw = int(K(wb.avg_write_bandwidth))
+
+class BdiStats(Stats):
+ def __init__(self, bdi):
+ self.name = bdi.dev_name.string_().decode()
+ self.stats = [0] * NR_WB_STAT_ITEMS
+ self.avg_bw = 0
+
+ def collectStats(self, wb_stats):
+ for i in range(NR_WB_STAT_ITEMS):
+ self.stats[i] += wb_stats.stats[i]
+
+ self.avg_bw += wb_stats.avg_bw
+
+exit_req = False
+
+def sigint_handler(signr, frame):
+ global exit_req
+ exit_req = True
+
+def main():
+ # handle args
+ Stats.table_fmt = not args.json
+ interval = args.interval
+ cgroup = args.cgroup
+
+ re_str = None
+ if args.bdi:
+ for r in args.bdi:
+ if re_str is None:
+ re_str = r
+ else:
+ re_str += '|' + r
+
+ filter_re = re.compile(re_str) if re_str else None
+
+ # monitoring loop
+ signal.signal(signal.SIGINT, sigint_handler)
+
+ while not exit_req:
+ Stats.now = time.time()
+
+ Stats.show_header()
+ for bdi in list_for_each_entry('struct backing_dev_info', bdi_list.address_of_(), 'bdi_list'):
+ bdi_stats = BdiStats(bdi)
+ if filter_re and not filter_re.search(bdi_stats.name):
+ continue
+
+ for wb in list_for_each_entry('struct bdi_writeback', bdi.wb_list.address_of_(), 'bdi_node'):
+ wb_stats = WbStats(wb)
+ bdi_stats.collectStats(wb_stats)
+ if cgroup:
+ wb_stats.show_stats()
+
+ bdi_stats.show_stats()
+ if cgroup and Stats.table_fmt:
+ print()
+
+ if interval == 0:
+ break
+ time.sleep(interval)
+
+if __name__ == "__main__":
+ main()
--
2.30.0
Commit 8d92890bd6b85 ("mm/writeback: discard NR_UNSTABLE_NFS, use
NR_WRITEBACK instead") removed NR_UNSTABLE_NFS, so nr_reclaimable
only contains dirty pages now.
Rename nr_reclaimable to nr_dirty to reflect that.
Signed-off-by: Kemeng Shi <[email protected]>
---
mm/page-writeback.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ba1b6b5ae5d6..481b6bf34c21 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1695,7 +1695,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
&mdtc_stor : NULL;
struct dirty_throttle_control *sdtc;
- unsigned long nr_reclaimable; /* = file_dirty */
+ unsigned long nr_dirty;
long period;
long pause;
long max_pause;
@@ -1716,9 +1716,9 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
unsigned long m_thresh = 0;
unsigned long m_bg_thresh = 0;
- nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
+ nr_dirty = global_node_page_state(NR_FILE_DIRTY);
gdtc->avail = global_dirtyable_memory();
- gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
+ gdtc->dirty = nr_dirty + global_node_page_state(NR_WRITEBACK);
domain_dirty_limits(gdtc);
@@ -1769,7 +1769,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
* In normal mode, we start background writeout at the lower
* background_thresh, to keep the amount of dirty memory low.
*/
- if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh &&
+ if (!laptop_mode && nr_dirty > gdtc->bg_thresh &&
!writeback_in_progress(wb))
wb_start_background_writeback(wb);
--
2.30.0
We never use the gdtc->dom that is set by GDTC_INIT_NO_WB, so just remove
the unneeded GDTC_INIT_NO_WB.
Signed-off-by: Kemeng Shi <[email protected]>
---
mm/page-writeback.c | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 481b6bf34c21..09b2b0754cc5 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -154,8 +154,6 @@ struct dirty_throttle_control {
.dom = &global_wb_domain, \
.wb_completions = &(__wb)->completions
-#define GDTC_INIT_NO_WB .dom = &global_wb_domain
-
#define MDTC_INIT(__wb, __gdtc) .wb = (__wb), \
.dom = mem_cgroup_wb_domain(__wb), \
.wb_completions = &(__wb)->memcg_completions, \
@@ -210,7 +208,6 @@ static void wb_min_max_ratio(struct bdi_writeback *wb,
#define GDTC_INIT(__wb) .wb = (__wb), \
.wb_completions = &(__wb)->completions
-#define GDTC_INIT_NO_WB
#define MDTC_INIT(__wb, __gdtc)
static bool mdtc_valid(struct dirty_throttle_control *dtc)
@@ -438,7 +435,7 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
*/
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{
- struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
+ struct dirty_throttle_control gdtc = { };
gdtc.avail = global_dirtyable_memory();
domain_dirty_limits(&gdtc);
@@ -895,7 +892,7 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
unsigned long wb_calc_cg_thresh(struct bdi_writeback *wb)
{
- struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
+ struct dirty_throttle_control gdtc = { };
struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) };
unsigned long filepages, headroom, writeback;
--
2.30.0
On Wed, Mar 20, 2024 at 07:02:18PM +0800, Kemeng Shi wrote:
> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> index 9845cb62e40b..bb1ce1123b52 100644
> --- a/include/linux/writeback.h
> +++ b/include/linux/writeback.h
> @@ -355,6 +355,7 @@ int dirtytime_interval_handler(struct ctl_table *table, int write,
>
> void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
> unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
> +unsigned long wb_calc_cg_thresh(struct bdi_writeback *wb);
Would cgwb_calc_thresh() be an easier name?
Thanks.
--
tejun
On Wed, Mar 20, 2024 at 07:02:20PM +0800, Kemeng Shi wrote:
> Add wb_monitor.py script to monitor writeback information on backing dev
> which makes it easier and more convenient to observe writeback behaviors
> of running system.
>
> This script is based on copy of wq_monitor.py.
I don't think I did that when wq_monitor.py was added but would you mind
adding an example output so that people can have a better idea on what to
expect?
Thanks.
--
tejun
Hello,
On Wed, Mar 20, 2024 at 07:02:22PM +0800, Kemeng Shi wrote:
> We never use gdtc->dom set with GDTC_INIT_NO_WB, just remove unneeded
> GDTC_INIT_NO_WB
>
> Signed-off-by: Kemeng Shi <[email protected]>
..
> void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> {
> - struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
> + struct dirty_throttle_control gdtc = { };
Even if it's currently not referenced, wouldn't it still be better to always
guarantee that a dtc's dom is always initialized? I'm not sure what we get
by removing this.
Thanks.
--
tejun
Hello,
On Wed, Mar 20, 2024 at 07:02:16PM +0800, Kemeng Shi wrote:
> /* all writeback info of bdi is successfully collected */
> # cat /sys/kernel/debug/bdi/252:16/stats:
> BdiWriteback: 448 kB
..
>
> /* per wb writeback info is collected */
> # cat /sys/kernel/debug/bdi/252:16/wb_stats:
> cat wb_stats
> WbCgIno: 1
..
>
> /* monitor writeback info */
> # ./wb_monitor.py 252:16 -c
> writeback reclaimable dirtied written avg_bw
> 252:16_1 0 0 0 0 102400
> 252:16_4284 672 820064 9230368 8410304 685612
> 252:16_4325 896 819840 10491264 9671648 652348
> 252:16 1568 1639904 19721632 18081952 1440360
>
>
> writeback reclaimable dirtied written avg_bw
> 252:16_1 0 0 0 0 102400
> 252:16_4284 672 820064 9230368 8410304 685612
> 252:16_4325 896 819840 10491264 9671648 652348
> 252:16 1568 1639904 19721632 18081952 1440360
> ...
Ah you have the example outputs here. It'd be nice to have these in the
commit messages too.
I'm not sure about the last patch but patchset looks good to me otherwise. I
don't feel particularly enthusiastic about adding more debugfs files
especially given that some distros disable debugfs completely, but no harm
in adding them either.
Thanks.
--
tejun
On Wed 20-03-24 19:02:16, Kemeng Shi wrote:
> This series tries to improve visilibity of writeback. Patch 1 make
> /sys/kernel/debug/bdi/xxx/stats show writeback info of whole bdi
> instead of only writeback info in root cgroup. Patch 2 add a new
> debug file /sys/kernel/debug/bdi/xxx/wb_stats to show per wb writeback
> info. Patch 4 add wb_monitor.py to monitor basic writeback info
> of running system, more info could be added on demand. Rest patches
> are some random cleanups. More details can be found in respective
> patches. Thanks!
>
> Following domain hierarchy is tested:
> global domain (320G)
> / \
> cgroup domain1(10G) cgroup domain2(10G)
> | |
> bdi wb1 wb2
>
> /* all writeback info of bdi is successfully collected */
> # cat /sys/kernel/debug/bdi/252:16/stats:
> BdiWriteback: 448 kB
> BdiReclaimable: 1303904 kB
> BdiDirtyThresh: 189914124 kB
> DirtyThresh: 195337564 kB
> BackgroundThresh: 32516508 kB
> BdiDirtied: 3591392 kB
> BdiWritten: 2287488 kB
> BdiWriteBandwidth: 322248 kBps
> b_dirty: 0
> b_io: 0
> b_more_io: 2
> b_dirty_time: 0
> bdi_list: 1
> state: 1
>
> /* per wb writeback info is collected */
> # cat /sys/kernel/debug/bdi/252:16/wb_stats:
> cat wb_stats
> WbCgIno: 1
> WbWriteback: 0 kB
> WbReclaimable: 0 kB
> WbDirtyThresh: 0 kB
> WbDirtied: 0 kB
> WbWritten: 0 kB
> WbWriteBandwidth: 102400 kBps
> b_dirty: 0
> b_io: 0
> b_more_io: 0
> b_dirty_time: 0
> state: 1
> WbCgIno: 4284
> WbWriteback: 448 kB
> WbReclaimable: 818944 kB
> WbDirtyThresh: 3096524 kB
> WbDirtied: 2266880 kB
> WbWritten: 1447936 kB
> WbWriteBandwidth: 214036 kBps
> b_dirty: 0
> b_io: 0
> b_more_io: 1
> b_dirty_time: 0
> state: 5
> WbCgIno: 4325
> WbWriteback: 224 kB
> WbReclaimable: 819392 kB
> WbDirtyThresh: 2920088 kB
> WbDirtied: 2551808 kB
> WbWritten: 1732416 kB
> WbWriteBandwidth: 201832 kBps
> b_dirty: 0
> b_io: 0
> b_more_io: 1
> b_dirty_time: 0
> state: 5
>
> /* monitor writeback info */
> # ./wb_monitor.py 252:16 -c
> writeback reclaimable dirtied written avg_bw
> 252:16_1 0 0 0 0 102400
> 252:16_4284 672 820064 9230368 8410304 685612
> 252:16_4325 896 819840 10491264 9671648 652348
> 252:16 1568 1639904 19721632 18081952 1440360
>
>
> writeback reclaimable dirtied written avg_bw
> 252:16_1 0 0 0 0 102400
> 252:16_4284 672 820064 9230368 8410304 685612
> 252:16_4325 896 819840 10491264 9671648 652348
> 252:16 1568 1639904 19721632 18081952 1440360
> ...
So I'm wondering: Are you implementing this just because this looks
interesting or do you have a real need for the functionality? Why?
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
on 3/20/2024 11:01 PM, Tejun Heo wrote:
> On Wed, Mar 20, 2024 at 07:02:18PM +0800, Kemeng Shi wrote:
>> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
>> index 9845cb62e40b..bb1ce1123b52 100644
>> --- a/include/linux/writeback.h
>> +++ b/include/linux/writeback.h
>> @@ -355,6 +355,7 @@ int dirtytime_interval_handler(struct ctl_table *table, int write,
>>
>> void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
>> unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
>> +unsigned long wb_calc_cg_thresh(struct bdi_writeback *wb);
>
> Would cgwb_calc_thresh() be an easier name?
>
Sure, will rename it in the next version. Thanks!
> Thanks.
>
on 3/20/2024 11:12 PM, Tejun Heo wrote:
> On Wed, Mar 20, 2024 at 07:02:20PM +0800, Kemeng Shi wrote:
>> Add wb_monitor.py script to monitor writeback information on backing dev
>> which makes it easier and more convenient to observe writeback behaviors
>> of running system.
>>
>> This script is based on copy of wq_monitor.py.
>
> I don't think I did that when wq_monitor.py was added but would you mind
> adding an example output so that people can have a better idea on what to
> expect?
Sorry for the confusion. What I mean is that wb_monitor.py was initially copied
from wq_monitor.py and then modified to monitor writeback info. I will
correct the description and add the example outputs.
Thanks
>
> Thanks.
>
on 3/20/2024 11:15 PM, Tejun Heo wrote:
> Hello,
>
> On Wed, Mar 20, 2024 at 07:02:22PM +0800, Kemeng Shi wrote:
>> We never use gdtc->dom set with GDTC_INIT_NO_WB, just remove unneeded
>> GDTC_INIT_NO_WB
>>
>> Signed-off-by: Kemeng Shi <[email protected]>
> ...
>> void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
>> {
>> - struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
>> + struct dirty_throttle_control gdtc = { };
>
> Even if it's currently not referenced, wouldn't it still be better to always
> guarantee that a dtc's dom is always initialized? I'm not sure what we get
> by removing this.
As we explicitly use GDTC_INIT_NO_WB to set global_wb_domain before
calculating the dirty limit with domain_dirty_limits, I intuitively thought the
dirty limit calculation in domain_dirty_limits was related to global_wb_domain
when CONFIG_CGROUP_WRITEBACK is enabled, while in truth it is not. So this is a
little confusing to me.
Would it be acceptable to you that we keep using GDTC_INIT_NO_WB but
define GDTC_INIT_NO_WB to null for now, and redefine GDTC_INIT_NO_WB when some
member of gdtc is really needed?
Of course I'm not insistent on this. Would like to hear your suggestion. Thanks!
>
> Thanks.
>
on 3/21/2024 1:22 AM, Jan Kara wrote:
> On Wed 20-03-24 19:02:16, Kemeng Shi wrote:
>> This series tries to improve visilibity of writeback. Patch 1 make
>> /sys/kernel/debug/bdi/xxx/stats show writeback info of whole bdi
>> instead of only writeback info in root cgroup. Patch 2 add a new
>> debug file /sys/kernel/debug/bdi/xxx/wb_stats to show per wb writeback
>> info. Patch 4 add wb_monitor.py to monitor basic writeback info
>> of running system, more info could be added on demand. Rest patches
>> are some random cleanups. More details can be found in respective
>> patches. Thanks!
>>
>> Following domain hierarchy is tested:
>> global domain (320G)
>> / \
>> cgroup domain1(10G) cgroup domain2(10G)
>> | |
>> bdi wb1 wb2
>>
>> /* all writeback info of bdi is successfully collected */
>> # cat /sys/kernel/debug/bdi/252:16/stats:
>> BdiWriteback: 448 kB
>> BdiReclaimable: 1303904 kB
>> BdiDirtyThresh: 189914124 kB
>> DirtyThresh: 195337564 kB
>> BackgroundThresh: 32516508 kB
>> BdiDirtied: 3591392 kB
>> BdiWritten: 2287488 kB
>> BdiWriteBandwidth: 322248 kBps
>> b_dirty: 0
>> b_io: 0
>> b_more_io: 2
>> b_dirty_time: 0
>> bdi_list: 1
>> state: 1
>>
>> /* per wb writeback info is collected */
>> # cat /sys/kernel/debug/bdi/252:16/wb_stats:
>> cat wb_stats
>> WbCgIno: 1
>> WbWriteback: 0 kB
>> WbReclaimable: 0 kB
>> WbDirtyThresh: 0 kB
>> WbDirtied: 0 kB
>> WbWritten: 0 kB
>> WbWriteBandwidth: 102400 kBps
>> b_dirty: 0
>> b_io: 0
>> b_more_io: 0
>> b_dirty_time: 0
>> state: 1
>> WbCgIno: 4284
>> WbWriteback: 448 kB
>> WbReclaimable: 818944 kB
>> WbDirtyThresh: 3096524 kB
>> WbDirtied: 2266880 kB
>> WbWritten: 1447936 kB
>> WbWriteBandwidth: 214036 kBps
>> b_dirty: 0
>> b_io: 0
>> b_more_io: 1
>> b_dirty_time: 0
>> state: 5
>> WbCgIno: 4325
>> WbWriteback: 224 kB
>> WbReclaimable: 819392 kB
>> WbDirtyThresh: 2920088 kB
>> WbDirtied: 2551808 kB
>> WbWritten: 1732416 kB
>> WbWriteBandwidth: 201832 kBps
>> b_dirty: 0
>> b_io: 0
>> b_more_io: 1
>> b_dirty_time: 0
>> state: 5
>>
>> /* monitor writeback info */
>> # ./wb_monitor.py 252:16 -c
>> writeback reclaimable dirtied written avg_bw
>> 252:16_1 0 0 0 0 102400
>> 252:16_4284 672 820064 9230368 8410304 685612
>> 252:16_4325 896 819840 10491264 9671648 652348
>> 252:16 1568 1639904 19721632 18081952 1440360
>>
>>
>> writeback reclaimable dirtied written avg_bw
>> 252:16_1 0 0 0 0 102400
>> 252:16_4284 672 820064 9230368 8410304 685612
>> 252:16_4325 896 819840 10491264 9671648 652348
>> 252:16 1568 1639904 19721632 18081952 1440360
>> ...
>
> So I'm wondering: Are you implementing this just because this looks
> interesting or do you have a real need for the functionality? Why?
Hi Jan, I added the debug files to test the change in [1], which changes how the
dirty background threshold of a wb is calculated. Without the debug files, we could
only monitor writeback to infer that the threshold is correct.
In the current patchset, the debug info does not include the dirty background
threshold yet; I will add it when the discussion of the calculation of the dirty
background threshold in [1] is done.
The wb_monitor.py script was suggested by Tejun in [2] to improve visibility of
writeback. The script is more convenient than tracing for monitoring the writeback
behavior of a running system.
Thanks
[1] https://lore.kernel.org/lkml/[email protected]/
[2] https://lore.kernel.org/lkml/[email protected]/
>
> Honza
>
On Thu 21-03-24 16:12:52, Kemeng Shi wrote:
> on 3/21/2024 1:22 AM, Jan Kara wrote:
> > On Wed 20-03-24 19:02:16, Kemeng Shi wrote:
> >> This series tries to improve visilibity of writeback. Patch 1 make
> >> /sys/kernel/debug/bdi/xxx/stats show writeback info of whole bdi
> >> instead of only writeback info in root cgroup. Patch 2 add a new
> >> debug file /sys/kernel/debug/bdi/xxx/wb_stats to show per wb writeback
> >> info. Patch 4 add wb_monitor.py to monitor basic writeback info
> >> of running system, more info could be added on demand. Rest patches
> >> are some random cleanups. More details can be found in respective
> >> patches. Thanks!
> >>
> >> Following domain hierarchy is tested:
> >> global domain (320G)
> >> / \
> >> cgroup domain1(10G) cgroup domain2(10G)
> >> | |
> >> bdi wb1 wb2
> >>
> >> /* all writeback info of bdi is successfully collected */
> >> # cat /sys/kernel/debug/bdi/252:16/stats:
> >> BdiWriteback: 448 kB
> >> BdiReclaimable: 1303904 kB
> >> BdiDirtyThresh: 189914124 kB
> >> DirtyThresh: 195337564 kB
> >> BackgroundThresh: 32516508 kB
> >> BdiDirtied: 3591392 kB
> >> BdiWritten: 2287488 kB
> >> BdiWriteBandwidth: 322248 kBps
> >> b_dirty: 0
> >> b_io: 0
> >> b_more_io: 2
> >> b_dirty_time: 0
> >> bdi_list: 1
> >> state: 1
> >>
> >> /* per wb writeback info is collected */
> >> # cat /sys/kernel/debug/bdi/252:16/wb_stats:
> >> cat wb_stats
> >> WbCgIno: 1
> >> WbWriteback: 0 kB
> >> WbReclaimable: 0 kB
> >> WbDirtyThresh: 0 kB
> >> WbDirtied: 0 kB
> >> WbWritten: 0 kB
> >> WbWriteBandwidth: 102400 kBps
> >> b_dirty: 0
> >> b_io: 0
> >> b_more_io: 0
> >> b_dirty_time: 0
> >> state: 1
> >> WbCgIno: 4284
> >> WbWriteback: 448 kB
> >> WbReclaimable: 818944 kB
> >> WbDirtyThresh: 3096524 kB
> >> WbDirtied: 2266880 kB
> >> WbWritten: 1447936 kB
> >> WbWriteBandwidth: 214036 kBps
> >> b_dirty: 0
> >> b_io: 0
> >> b_more_io: 1
> >> b_dirty_time: 0
> >> state: 5
> >> WbCgIno: 4325
> >> WbWriteback: 224 kB
> >> WbReclaimable: 819392 kB
> >> WbDirtyThresh: 2920088 kB
> >> WbDirtied: 2551808 kB
> >> WbWritten: 1732416 kB
> >> WbWriteBandwidth: 201832 kBps
> >> b_dirty: 0
> >> b_io: 0
> >> b_more_io: 1
> >> b_dirty_time: 0
> >> state: 5
> >>
> >> /* monitor writeback info */
> >> # ./wb_monitor.py 252:16 -c
> >> writeback reclaimable dirtied written avg_bw
> >> 252:16_1 0 0 0 0 102400
> >> 252:16_4284 672 820064 9230368 8410304 685612
> >> 252:16_4325 896 819840 10491264 9671648 652348
> >> 252:16 1568 1639904 19721632 18081952 1440360
> >>
> >>
> >> writeback reclaimable dirtied written avg_bw
> >> 252:16_1 0 0 0 0 102400
> >> 252:16_4284 672 820064 9230368 8410304 685612
> >> 252:16_4325 896 819840 10491264 9671648 652348
> >> 252:16 1568 1639904 19721632 18081952 1440360
> >> ...
> >
> > So I'm wondering: Are you implementing this just because this looks
> > interesting or do you have a real need for the functionality? Why?
> Hi Jan, I added debug files to test change in [1] which changes the way how
> dirty background threshold of wb is calculated. Without debug files, we could
> only monitor writeback to imply that threshold is corrected.
> In current patchset, debug info has not included dirty background threshold yet,
> I will add it when discution of calculation of dirty background threshold in [1]
> is done.
> The wb_monitor.py is suggested by Tejun in [2] to improve visibility of writeback.
> The script is more convenient than trace to monitor writeback behavior of the running
> system.
Thanks for the pointer. OK, I agree this is useful so let's have a look
into the code :)
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Thu, Mar 21, 2024 at 03:12:21PM +0800, Kemeng Shi wrote:
>
>
> on 3/20/2024 11:15 PM, Tejun Heo wrote:
> > Hello,
> >
> > On Wed, Mar 20, 2024 at 07:02:22PM +0800, Kemeng Shi wrote:
> >> We never use gdtc->dom set with GDTC_INIT_NO_WB, just remove unneeded
> >> GDTC_INIT_NO_WB
> >>
> >> Signed-off-by: Kemeng Shi <[email protected]>
> > ...
> >> void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> >> {
> >> - struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
> >> + struct dirty_throttle_control gdtc = { };
> >
> > Even if it's currently not referenced, wouldn't it still be better to always
> > guarantee that a dtc's dom is always initialized? I'm not sure what we get
> > by removing this.
> As we explicitly use GDTC_INIT_NO_WB to set global_wb_domain before
> calculating dirty limit with domain_dirty_limits, I intuitively think the dirty
> limit calculation in domain_dirty_limits is related to global_wb_domain when
> CONFIG_WRITEBACK_CGROUP is enabled while the truth is not. So this is a little
> confusing to me.
> Would it be acceptable to you that we keep useing GDTC_INIT_NO_WB but
> define GDTC_INIT_NO_WB to null fow now and redefine GDTC_INIT_NO_WB when some
> member of gdtc is really needed.
> Of couse I'm not insistent on this. Would like to hear you suggestion. Thanks!
Ah, I see. In that case, the proposed change of removing GDTC_INIT_NO_WB
looks good to me.
Thanks.
--
tejun
On Wed 20-03-24 19:02:18, Kemeng Shi wrote:
> Add /sys/kernel/debug/bdi/xxx/wb_stats to show per group writeback stats
> of bdi.
>
> Signed-off-by: Kemeng Shi <[email protected]>
..
> +unsigned long wb_calc_cg_thresh(struct bdi_writeback *wb)
> +{
> + struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
> + struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) };
> + unsigned long filepages, headroom, writeback;
> +
> + gdtc.avail = global_dirtyable_memory();
> + gdtc.dirty = global_node_page_state(NR_FILE_DIRTY) +
> + global_node_page_state(NR_WRITEBACK);
> +
> + mem_cgroup_wb_stats(wb, &filepages, &headroom,
> + &mdtc.dirty, &writeback);
> + mdtc.dirty += writeback;
> + mdtc_calc_avail(&mdtc, filepages, headroom);
> + domain_dirty_limits(&mdtc);
> +
> + return __wb_calc_thresh(&mdtc, mdtc.thresh);
> +}
I kind of wish we didn't replicate this logic from balance_dirty_pages()
and wb_over_bg_thresh() into yet another place. But the refactoring would
be kind of difficult. So OK.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Wed 20-03-24 19:02:21, Kemeng Shi wrote:
> Commit 8d92890bd6b85 ("mm/writeback: discard NR_UNSTABLE_NFS, use
> NR_WRITEBACK instead") removed NR_UNSTABLE_NFS and nr_reclaimable
> only contains dirty page now.
> Rename nr_reclaimable to nr_dirty properly.
>
> Signed-off-by: Kemeng Shi <[email protected]>
Looks good. Feel free to add:
Reviewed-by: Jan Kara <[email protected]>
Honza
> ---
> mm/page-writeback.c | 8 ++++----
> 1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index ba1b6b5ae5d6..481b6bf34c21 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -1695,7 +1695,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
> struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
> &mdtc_stor : NULL;
> struct dirty_throttle_control *sdtc;
> - unsigned long nr_reclaimable; /* = file_dirty */
> + unsigned long nr_dirty;
> long period;
> long pause;
> long max_pause;
> @@ -1716,9 +1716,9 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
> unsigned long m_thresh = 0;
> unsigned long m_bg_thresh = 0;
>
> - nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
> + nr_dirty = global_node_page_state(NR_FILE_DIRTY);
> gdtc->avail = global_dirtyable_memory();
> - gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
> + gdtc->dirty = nr_dirty + global_node_page_state(NR_WRITEBACK);
>
> domain_dirty_limits(gdtc);
>
> @@ -1769,7 +1769,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb,
> * In normal mode, we start background writeout at the lower
> * background_thresh, to keep the amount of dirty memory low.
> */
> - if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh &&
> + if (!laptop_mode && nr_dirty > gdtc->bg_thresh &&
> !writeback_in_progress(wb))
> wb_start_background_writeback(wb);
>
> --
> 2.30.0
>
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Wed 20-03-24 19:02:22, Kemeng Shi wrote:
> We never use gdtc->dom set with GDTC_INIT_NO_WB, just remove unneeded
> GDTC_INIT_NO_WB
>
> Signed-off-by: Kemeng Shi <[email protected]>
Please no, this leaves a trap for the future. If anything, I'd teach
GDTC_INIT() that 'wb' can be NULL and replace GDTC_INIT_NO_WB with
GDTC_INIT(NULL).
Honza
> ---
> mm/page-writeback.c | 7 ++-----
> 1 file changed, 2 insertions(+), 5 deletions(-)
>
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 481b6bf34c21..09b2b0754cc5 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -154,8 +154,6 @@ struct dirty_throttle_control {
> .dom = &global_wb_domain, \
> .wb_completions = &(__wb)->completions
>
> -#define GDTC_INIT_NO_WB .dom = &global_wb_domain
> -
> #define MDTC_INIT(__wb, __gdtc) .wb = (__wb), \
> .dom = mem_cgroup_wb_domain(__wb), \
> .wb_completions = &(__wb)->memcg_completions, \
> @@ -210,7 +208,6 @@ static void wb_min_max_ratio(struct bdi_writeback *wb,
>
> #define GDTC_INIT(__wb) .wb = (__wb), \
> .wb_completions = &(__wb)->completions
> -#define GDTC_INIT_NO_WB
> #define MDTC_INIT(__wb, __gdtc)
>
> static bool mdtc_valid(struct dirty_throttle_control *dtc)
> @@ -438,7 +435,7 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
> */
> void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> {
> - struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
> + struct dirty_throttle_control gdtc = { };
>
> gdtc.avail = global_dirtyable_memory();
> domain_dirty_limits(&gdtc);
> @@ -895,7 +892,7 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
>
> unsigned long wb_calc_cg_thresh(struct bdi_writeback *wb)
> {
> - struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
> + struct dirty_throttle_control gdtc = { };
> struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) };
> unsigned long filepages, headroom, writeback;
>
> --
> 2.30.0
>
--
Jan Kara <[email protected]>
SUSE Labs, CR
on 3/26/2024 4:26 AM, Tejun Heo wrote:
> On Thu, Mar 21, 2024 at 03:12:21PM +0800, Kemeng Shi wrote:
>>
>>
>> on 3/20/2024 11:15 PM, Tejun Heo wrote:
>>> Hello,
>>>
>>> On Wed, Mar 20, 2024 at 07:02:22PM +0800, Kemeng Shi wrote:
>>>> We never use gdtc->dom set with GDTC_INIT_NO_WB, just remove unneeded
>>>> GDTC_INIT_NO_WB
>>>>
>>>> Signed-off-by: Kemeng Shi <[email protected]>
>>> ...
>>>> void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
>>>> {
>>>> - struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
>>>> + struct dirty_throttle_control gdtc = { };
>>>
>>> Even if it's currently not referenced, wouldn't it still be better to always
>>> guarantee that a dtc's dom is always initialized? I'm not sure what we get
>>> by removing this.
>> As we explicitly use GDTC_INIT_NO_WB to set global_wb_domain before
>> calculating dirty limit with domain_dirty_limits, I intuitively think the dirty
>> limit calculation in domain_dirty_limits is related to global_wb_domain when
>> CONFIG_WRITEBACK_CGROUP is enabled while the truth is not. So this is a little
>> confusing to me.
>> Would it be acceptable to you that we keep using GDTC_INIT_NO_WB but
>> define GDTC_INIT_NO_WB to null for now and redefine GDTC_INIT_NO_WB when some
>> member of gdtc is really needed?
>> Of course I'm not insistent on this. I would like to hear your suggestion. Thanks!
>
> Ah, I see. In that case, the proposed change of removing GDTC_INIT_NO_WB
> looks good to me.
Sure, will do it in next version. Thanks!
>
> Thanks.
>
on 3/26/2024 8:35 PM, Jan Kara wrote:
> On Wed 20-03-24 19:02:22, Kemeng Shi wrote:
>> We never use gdtc->dom set with GDTC_INIT_NO_WB, just remove unneeded
>> GDTC_INIT_NO_WB
>>
>> Signed-off-by: Kemeng Shi <[email protected]>
>
> Please no, this leaves a trap for the future. If anything, I'd teach
> GDTC_INIT() that 'wb' can be NULL and replace GDTC_INIT_NO_WB with
> GDTC_INIT(NULL).
Would it be acceptable to define GDTC_INIT_NO_WB to null for now, as
discussed in [1]?
[1] https://lore.kernel.org/lkml/[email protected]/
Thanks,
Kemeng
>
> Honza
>
>> ---
>> mm/page-writeback.c | 7 ++-----
>> 1 file changed, 2 insertions(+), 5 deletions(-)
>>
>> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
>> index 481b6bf34c21..09b2b0754cc5 100644
>> --- a/mm/page-writeback.c
>> +++ b/mm/page-writeback.c
>> @@ -154,8 +154,6 @@ struct dirty_throttle_control {
>> .dom = &global_wb_domain, \
>> .wb_completions = &(__wb)->completions
>>
>> -#define GDTC_INIT_NO_WB .dom = &global_wb_domain
>> -
>> #define MDTC_INIT(__wb, __gdtc) .wb = (__wb), \
>> .dom = mem_cgroup_wb_domain(__wb), \
>> .wb_completions = &(__wb)->memcg_completions, \
>> @@ -210,7 +208,6 @@ static void wb_min_max_ratio(struct bdi_writeback *wb,
>>
>> #define GDTC_INIT(__wb) .wb = (__wb), \
>> .wb_completions = &(__wb)->completions
>> -#define GDTC_INIT_NO_WB
>> #define MDTC_INIT(__wb, __gdtc)
>>
>> static bool mdtc_valid(struct dirty_throttle_control *dtc)
>> @@ -438,7 +435,7 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
>> */
>> void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
>> {
>> - struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
>> + struct dirty_throttle_control gdtc = { };
>>
>> gdtc.avail = global_dirtyable_memory();
>> domain_dirty_limits(&gdtc);
>> @@ -895,7 +892,7 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
>>
>> unsigned long wb_calc_cg_thresh(struct bdi_writeback *wb)
>> {
>> - struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
>> + struct dirty_throttle_control gdtc = { };
>> struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) };
>> unsigned long filepages, headroom, writeback;
>>
>> --
>> 2.30.0
>>
on 3/26/2024 8:24 PM, Jan Kara wrote:
> On Wed 20-03-24 19:02:18, Kemeng Shi wrote:
>> Add /sys/kernel/debug/bdi/xxx/wb_stats to show per group writeback stats
>> of bdi.
>>
>> Signed-off-by: Kemeng Shi <[email protected]>
>
> ...
>
>> +unsigned long wb_calc_cg_thresh(struct bdi_writeback *wb)
>> +{
>> + struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
>> + struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) };
>> + unsigned long filepages, headroom, writeback;
>> +
>> + gdtc.avail = global_dirtyable_memory();
>> + gdtc.dirty = global_node_page_state(NR_FILE_DIRTY) +
>> + global_node_page_state(NR_WRITEBACK);
>> +
>> + mem_cgroup_wb_stats(wb, &filepages, &headroom,
>> + &mdtc.dirty, &writeback);
>> + mdtc.dirty += writeback;
>> + mdtc_calc_avail(&mdtc, filepages, headroom);
>> + domain_dirty_limits(&mdtc);
>> +
>> + return __wb_calc_thresh(&mdtc, mdtc.thresh);
>> +}
>
> I kind of wish we didn't replicate this logic from balance_dirty_pages()
> and wb_over_bg_thresh() into yet another place. But the refactoring would
> be kind of difficult. So OK.
Thanks for reviewing the code.
I have considered this. It's difficult and would introduce a lot of
changes to non-debug code, which would make this series hard to review.
I will submit another patch for the refactoring if I can find a way to
clean up the code.
Kemeng
>
> Honza
>
On Thu 21-03-24 15:12:21, Kemeng Shi wrote:
>
>
> on 3/20/2024 11:15 PM, Tejun Heo wrote:
> > Hello,
> >
> > On Wed, Mar 20, 2024 at 07:02:22PM +0800, Kemeng Shi wrote:
> >> We never use gdtc->dom set with GDTC_INIT_NO_WB, just remove unneeded
> >> GDTC_INIT_NO_WB
> >>
> >> Signed-off-by: Kemeng Shi <[email protected]>
> > ...
> >> void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> >> {
> >> - struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
> >> + struct dirty_throttle_control gdtc = { };
> >
> > Even if it's currently not referenced, wouldn't it still be better to always
> > guarantee that a dtc's dom is always initialized? I'm not sure what we get
> > by removing this.
> As we explicitly use GDTC_INIT_NO_WB to set global_wb_domain before
> calculating dirty limit with domain_dirty_limits, I intuitively think the
> dirty limit calculation in domain_dirty_limits is related to
> global_wb_domain when CONFIG_WRITEBACK_CGROUP is enabled while the truth
> is not. So this is a little confusing to me.
I'm not sure I understand your confusion. domain_dirty_limits() calculates
the dirty limit (and background dirty limit) for the dirty_throttle_control
passed in. If you pass dtc initialized with GDTC_INIT[_NO_WB], it will
compute global dirty limits. If the dtc passed in is initialized with
MDTC_INIT() it will compute cgroup specific dirty limits.
Now because domain_dirty_limits() does not scale the limits based on each
device throughput - that is done only later in __wb_calc_thresh() to avoid
relatively expensive computations when we don't need them - and also
because the effective dirty limit (dtc->dom->dirty_limit) is not updated by
domain_dirty_limits(), domain_dirty_limits() does not need dtc->dom at all.
But that is a technical detail of implementation and I don't want this
technical detail to be relied on by even more code.
What might have confused you is that GDTC_INIT_NO_WB is defined to be empty
when CONFIG_CGROUP_WRITEBACK is disabled. But this is only because in that
case dtc_dom() function unconditionally returns global_wb_domain so we
don't bother with initializing (or even having) the 'dom' field anywhere.
Now I agree this whole code is substantially confusing and complex and it
would all deserve some serious thought how to make it more readable. But
even after thinking about it again I don't think removing GDTC_INIT_NO_WB is
the right way to go.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
on 3/27/2024 5:33 PM, Jan Kara wrote:
> On Thu 21-03-24 15:12:21, Kemeng Shi wrote:
>>
>>
>> on 3/20/2024 11:15 PM, Tejun Heo wrote:
>>> Hello,
>>>
>>> On Wed, Mar 20, 2024 at 07:02:22PM +0800, Kemeng Shi wrote:
>>>> We never use gdtc->dom set with GDTC_INIT_NO_WB, just remove unneeded
>>>> GDTC_INIT_NO_WB
>>>>
>>>> Signed-off-by: Kemeng Shi <[email protected]>
>>> ...
>>>> void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
>>>> {
>>>> - struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
>>>> + struct dirty_throttle_control gdtc = { };
>>>
>>> Even if it's currently not referenced, wouldn't it still be better to always
>>> guarantee that a dtc's dom is always initialized? I'm not sure what we get
>>> by removing this.
>> As we explicitly use GDTC_INIT_NO_WB to set global_wb_domain before
>> calculating dirty limit with domain_dirty_limits, I intuitively think the
>> dirty limit calculation in domain_dirty_limits is related to
>> global_wb_domain when CONFIG_WRITEBACK_CGROUP is enabled while the truth
>> is not. So this is a little confusing to me.
>
Hi Jan,
> I'm not sure I understand your confusion. domain_dirty_limits() calculates
> the dirty limit (and background dirty limit) for the dirty_throttle_control
> passed in. If you pass dtc initialized with GDTC_INIT[_NO_WB], it will
> compute global dirty limits. If the dtc passed in is initialized with
> MDTC_INIT() it will compute cgroup specific dirty limits.
No doubt about this.
>
> Now because domain_dirty_limits() does not scale the limits based on each
> device throughput - that is done only later in __wb_calc_thresh() to avoid
> relatively expensive computations when we don't need them - and also
> because the effective dirty limit (dtc->dom->dirty_limit) is not updated by
> domain_dirty_limits(), domain_dirty_limits() does not need dtc->dom at all.
Actually, here is the thing confusing me. For wb_calc_thresh, we always pass
a dtc initialized with a wb (GDTC_INIT(wb) or MDTC_INIT(wb, ...)). The dtc
initialized with _NO_WB is only passed to domain_dirty_limits. However, the
dom initialized by _NO_WB is not needed by domain_dirty_limits at all.
> But that is a technical detail of implementation and I don't want this
> technical detail to be relied on by even more code.
Yes, I agree with this. So I wonder if it's acceptable to simply define
GDTC_INIT_NO_WB to be empty for now instead of removing the definition of
GDTC_INIT_NO_WB. When the implementation of domain_dirty_limits() or any
other low-level function using GDTC_INIT(_NO_WB) changes in the future to
need dtc->dom, we re-define GDTC_INIT_NO_WB to the proper value.
As this only looks confusing to me, I will drop this one in the next version
if you still prefer to keep the definition of GDTC_INIT_NO_WB in the old way.
Thanks,
Kemeng
>
> What might have confused you is that GDTC_INIT_NO_WB is defined to be empty
> when CONFIG_CGROUP_WRITEBACK is disabled. But this is only because in that
> case dtc_dom() function unconditionally returns global_wb_domain so we
> don't bother with initializing (or even having) the 'dom' field anywhere.
>
> Now I agree this whole code is substantially confusing and complex and it
> would all deserve some serious thought how to make it more readable. But
> even after thinking about it again I don't think removing GDTC_INIT_NO_WB is
> the right way to go.
>
> Honza
>
On Thu 28-03-24 09:49:59, Kemeng Shi wrote:
> on 3/27/2024 5:33 PM, Jan Kara wrote:
> > On Thu 21-03-24 15:12:21, Kemeng Shi wrote:
> >>
> >>
> >> on 3/20/2024 11:15 PM, Tejun Heo wrote:
> >>> Hello,
> >>>
> >>> On Wed, Mar 20, 2024 at 07:02:22PM +0800, Kemeng Shi wrote:
> >>>> We never use gdtc->dom set with GDTC_INIT_NO_WB, just remove unneeded
> >>>> GDTC_INIT_NO_WB
> >>>>
> >>>> Signed-off-by: Kemeng Shi <[email protected]>
> >>> ...
> >>>> void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
> >>>> {
> >>>> - struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
> >>>> + struct dirty_throttle_control gdtc = { };
> >>>
> >>> Even if it's currently not referenced, wouldn't it still be better to always
> >>> guarantee that a dtc's dom is always initialized? I'm not sure what we get
> >>> by removing this.
> >> As we explicitly use GDTC_INIT_NO_WB to set global_wb_domain before
> >> calculating dirty limit with domain_dirty_limits, I intuitively think the
> >> dirty limit calculation in domain_dirty_limits is related to
> >> global_wb_domain when CONFIG_WRITEBACK_CGROUP is enabled while the truth
> >> is not. So this is a little confusing to me.
> >
> Hi Jan,
> > I'm not sure I understand your confusion. domain_dirty_limits() calculates
> > the dirty limit (and background dirty limit) for the dirty_throttle_control
> > passed in. If you pass dtc initialized with GDTC_INIT[_NO_WB], it will
> > compute global dirty limits. If the dtc passed in is initialized with
> > MDTC_INIT() it will compute cgroup specific dirty limits.
> No doubt about this.
> >
> > Now because domain_dirty_limits() does not scale the limits based on each
> > device throughput - that is done only later in __wb_calc_thresh() to avoid
> > relatively expensive computations when we don't need them - and also
> > because the effective dirty limit (dtc->dom->dirty_limit) is not updated by
> > domain_dirty_limits(), domain_dirty_limits() does not need dtc->dom at all.
> Acutally, here is the thing confusing me. For wb_calc_thresh, we always pass
> dtc initialized with a wb (GDTC_INIT(wb) or MDTC_INIT(wb,..). The dtc
> initialized with _NO_WB is only passed to domain_dirty_limits. However, The
> dom initialized by _NO_WB for domain_dirty_limits is not needed at all.
> > But that is a technical detail of implementation and I don't want this
> > technical detail to be relied on by even more code.
> Yes, I agree with this. So I wonder if it's acceptable to simply define
> GDTC_INIT_NO_WB to empty for now instead of remove defination of
> GDTC_INIT_NO_WB. When implementation of domain_dirty_limits() or any
> other low level function in future using GDTC_INIT(_NO_WB) changes to
> need dtc->domain, we re-define GDTC_INIT_NO_WB to proper value.
> As this only looks confusing to me. I will drop this one in next version
> if you still prefer to keep definatino of GDTC_INIT_NO_WB in the old way.
Yeah, please keep the code as is for now. I agree this needs some cleanups
but what you suggest is IMHO not an improvement.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
on 4/2/2024 9:53 PM, Jan Kara wrote:
> On Thu 28-03-24 09:49:59, Kemeng Shi wrote:
>> on 3/27/2024 5:33 PM, Jan Kara wrote:
>>> On Thu 21-03-24 15:12:21, Kemeng Shi wrote:
>>>>
>>>>
>>>> on 3/20/2024 11:15 PM, Tejun Heo wrote:
>>>>> Hello,
>>>>>
>>>>> On Wed, Mar 20, 2024 at 07:02:22PM +0800, Kemeng Shi wrote:
>>>>>> We never use gdtc->dom set with GDTC_INIT_NO_WB, just remove unneeded
>>>>>> GDTC_INIT_NO_WB
>>>>>>
>>>>>> Signed-off-by: Kemeng Shi <[email protected]>
>>>>> ...
>>>>>> void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
>>>>>> {
>>>>>> - struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
>>>>>> + struct dirty_throttle_control gdtc = { };
>>>>>
>>>>> Even if it's currently not referenced, wouldn't it still be better to always
>>>>> guarantee that a dtc's dom is always initialized? I'm not sure what we get
>>>>> by removing this.
>>>> As we explicitly use GDTC_INIT_NO_WB to set global_wb_domain before
>>>> calculating dirty limit with domain_dirty_limits, I intuitively think the
>>>> dirty limit calculation in domain_dirty_limits is related to
>>>> global_wb_domain when CONFIG_WRITEBACK_CGROUP is enabled while the truth
>>>> is not. So this is a little confusing to me.
>>>
>> Hi Jan,
>>> I'm not sure I understand your confusion. domain_dirty_limits() calculates
>>> the dirty limit (and background dirty limit) for the dirty_throttle_control
>>> passed in. If you pass dtc initialized with GDTC_INIT[_NO_WB], it will
>>> compute global dirty limits. If the dtc passed in is initialized with
>>> MDTC_INIT() it will compute cgroup specific dirty limits.
>> No doubt about this.
>>>
>>> Now because domain_dirty_limits() does not scale the limits based on each
>>> device throughput - that is done only later in __wb_calc_thresh() to avoid
>>> relatively expensive computations when we don't need them - and also
>>> because the effective dirty limit (dtc->dom->dirty_limit) is not updated by
>>> domain_dirty_limits(), domain_dirty_limits() does not need dtc->dom at all.
>> Acutally, here is the thing confusing me. For wb_calc_thresh, we always pass
>> dtc initialized with a wb (GDTC_INIT(wb) or MDTC_INIT(wb,..). The dtc
>> initialized with _NO_WB is only passed to domain_dirty_limits. However, The
>> dom initialized by _NO_WB for domain_dirty_limits is not needed at all.
>>> But that is a technical detail of implementation and I don't want this
>>> technical detail to be relied on by even more code.
>> Yes, I agree with this. So I wonder if it's acceptable to simply define
>> GDTC_INIT_NO_WB to empty for now instead of remove defination of
>> GDTC_INIT_NO_WB. When implementation of domain_dirty_limits() or any
>> other low level function in future using GDTC_INIT(_NO_WB) changes to
>> need dtc->domain, we re-define GDTC_INIT_NO_WB to proper value.
>> As this only looks confusing to me. I will drop this one in next version
>> if you still prefer to keep definatino of GDTC_INIT_NO_WB in the old way.
>
> Yeah, please keep the code as is for now. I agree this needs some cleanups
> but what you suggest is IMHO not an improvement.
Sure, will drop this in next version.
Thanks,
Kemeng
>
> Honza
>