2020-03-26 03:27:09

by NeilBrown

Subject: [PATCH/RFC] MM: fix writeback for NFS


Page accounting happens a little differently for NFS, and writeback is
aware of this, but handles it wrongly.
With NFS, pages go through three states which are accounted separately:
NR_FILE_DIRTY - page is dirty and unwritten
NR_WRITEBACK - page is being written back
NR_UNSTABLE_NFS - page has been written, but might need to be written
again if server restarts. Once an NFS COMMIT
completes, page becomes freeable.
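
As a quick reference, the accounting transitions described above are
(sketch only):

	/*
	 * dirty     (NR_FILE_DIRTY)    - dirtied, not yet written
	 * writeback (NR_WRITEBACK)     - WRITE rpc in flight
	 * unstable  (NR_UNSTABLE_NFS)  - on the server, awaiting COMMIT
	 * freeable                     - a COMMIT has succeeded
	 */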

writeback combines the NR_FILE_DIRTY and NR_UNSTABLE_NFS counters,
which doesn't make a lot of sense. NR_UNSTABLE_NFS is more
like NR_WRITEBACK in that there is nothing that can be done for these
pages except wait.

Current behaviour is that when there are a lot of NR_UNSTABLE_NFS pages,
balance_dirty_pages() repeatedly schedules the writeback thread, even
when the number of dirty pages is below the threshold. The result is
that each ->writepages() call into NFS finds only relatively few DIRTY
pages, so it creates requests that are well below the maximum size.
Depending on the performance characteristics of the server, this can
substantially reduce throughput.

There are two places where balance_dirty_pages() schedules the writeback
thread, and each of them schedule writeback too often.

1/ When balance_dirty_pages() determines that it must wait for the
number of dirty pages to shrink, it unconditionally schedules
writeback.
This is probably not ideal for any filesystem but I measured it to be
harmful for NFS. This writeback should only be scheduled if the
number of dirty pages is above the threshold. While it is below,
waiting for the pending writes to complete (and be committed) is a more
appropriate behaviour.

2/ When balance_dirty_pages() finishes, it again schedules writeback
if nr_reclaimable is above the background threshold. This is
conceptually correct, but wrong because nr_reclaimable is wrong.
The implication of this usage is that nr_reclaimable is the number
of pages that can be reclaimed if writeback is triggered. In fact it
is NR_FILE_DIRTY plus NR_UNSTABLE_NFS, which is not a meaningful
number.

So this patch removes NR_UNSTABLE_NFS from nr_reclaimable, and only
schedules writeback when the new nr_reclaimable is greater than
->thresh or ->bg_thresh, depending on context.

The effect of this can be measured by looking at the count of
nfs_writepages calls while performing a large streaming write (larger
than dirty_threshold)

e.g.
mount -t nfs server:/path /mnt
dd if=/dev/zero of=/mnt/zeros bs=1M count=1000
awk '/events:/ {print $13}' /proc/self/mountstats  # field 13 of the events: line is the vfswritepages counter

Each writepages call should normally submit writes for 4M (1024 pages)
or more (MIN_WRITEBACK_PAGES) (unless memory size is tiny).
With the patch I measure typically 200-300 writepages calls with the
above, which suggests an average of about 850 pages being made available
to each writepages call (1000MB is 256,000 4K pages, and 256,000/300 is
roughly 850). This is less than MIN_WRITEBACK_PAGES, but enough
to assemble large 1MB WRITE requests fairly often.

Without the patch, the numbers range from about 2000 to over 10000,
meaning an average of maybe 100 pages per call, so we rarely get large
1M requests and often get small ones.

Note that there are other places where NR_FILE_DIRTY and NR_UNSTABLE_NFS
are combined (wb_over_bg_thresh() and get_nr_dirty_pages()). I suspect
they are wrong too. I also suspect that unstable-NFS pages should not be
included in the WB_RECLAIMABLE wb_stat, but I haven't explored that in
detail yet.

Note that the current behaviour of NFS w.r.t sending COMMIT requests is
that as soon as a batch of writes all complete, a COMMIT is scheduled.
Prior to commit 919e3bd9a875 ("NFS: Ensure we commit after writeback is
complete"), other mechanisms triggered the COMMIT. It is possible that
the current writeback code made more sense before that commit.

Signed-off-by: NeilBrown <[email protected]>
---
mm/page-writeback.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2caf780a42e7..99419cba1e7a 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1593,10 +1593,11 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
* written to the server's write cache, but has not yet
* been flushed to permanent storage.
*/
- nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) +
- global_node_page_state(NR_UNSTABLE_NFS);
+ nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
gdtc->avail = global_dirtyable_memory();
- gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
+ gdtc->dirty = nr_reclaimable +
+ global_node_page_state(NR_WRITEBACK) +
+ global_node_page_state(NR_UNSTABLE_NFS);

domain_dirty_limits(gdtc);

@@ -1664,7 +1665,8 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
break;
}

- if (unlikely(!writeback_in_progress(wb)))
+ if (nr_reclaimable > gdtc->thresh &&
+ unlikely(!writeback_in_progress(wb)))
wb_start_background_writeback(wb);

mem_cgroup_flush_foreign(wb);
--
2.25.2



2020-04-01 23:54:01

by NeilBrown

Subject: Writeback fixes for NFS


Please ignore my previous patch (which this is in reply to), it was
flawed in various ways.

I now understand the code a bit better and have a somewhat simpler patch
which appears to address the same problem.

The problem is that writeback to NFS often produces lots of small writes
(10s of K) rather than fewer large writes (1M). This pattern can often
hurt throughput, but in certain circumstances it can hurt NFS throughput
more than expected.

Each nfs_writepages() call results in an NFS COMMIT being sent to the
server. If writeback triggers lots of smaller nfs_writepages calls,
this means lots of COMMITs. If the server is slow to handle the COMMIT
(I've seen the Ganesha NFS server take over 200ms per COMMIT), these
COMMITs can overlap, queue up, and choke the NFS server, causing an
order-of-magnitude drop in throughput.

So we really want to call nfs_writepages only when there is a largish
number of pages to be written - i.e. pages that are 'dirty'.

For historical reasons that I didn't thoroughly research, but which I'm
confident are no longer relevant, pages that have been written to the
NFS server but have not yet been the subject of a COMMIT - so-called
"unstable" pages - are effectively accounted the same as "dirty" pages
(sometimes called "reclaimable").
This can result in writeback thinking there are lots of "dirty" pages to
reclaim, while nfs_writepages can only find a few that it can write out.

The second patch following changes the accounting for these "unstable"
pages. They are now accounted exactly the same way as writeback
pages. Conceptually they can be thought of as still in writeback, except
that the writeback is now happening on the server.

A COMMIT will always automatically follow the writes generated by
nfs_writepages, so from the perspective of the VM, there really is no
difference: It has scheduled the write and there is nothing else it can
do except wait.
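
For reference, the mechanism added by commit 919e3bd9a875 is roughly the
following (a sketch modelled on fs/nfs/write.c, not the verbatim
source): an io-completion callback runs once every WRITE generated by a
single nfs_writepages() call has finished, and it immediately sends the
COMMIT.

	/* Sketch: invoked when all WRITEs issued by one nfs_writepages()
	 * call have completed; the COMMIT follows with no further
	 * prompting from the VM.
	 */
	static void nfs_io_completion_commit(void *inode)
	{
		nfs_commit_inode(inode, 0);
	}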

Testing this patch showed that loop-back NFS is prone to deadlocks
again. I cannot see exactly how the change to 'unstable' accounting
affected this, but I can see that the old +25% heuristic can no longer
be justified given the complexity of writeback calculations.
So the first patch following changes how writeback is handled for NFS
servers handling loop-back requests (and other similar services) so that
it is more obviously safe against excessive dirty pages scheduled for
other devices.

Thanks,
NeilBrown



2020-04-01 23:54:10

by NeilBrown

Subject: [PATCH 1/2] MM: replace PF_LESS_THROTTLE with PF_LOCAL_THROTTLE


PF_LESS_THROTTLE exists for loop-back nfsd, and a similar need in the
loop block driver, where a daemon needs to write to one bdi in
order to free up writes queued to another bdi.

The daemon sets PF_LESS_THROTTLE and gets a larger allowance of dirty
pages, so that it can still dirty pages after other processes have been
throttled.

This approach was designed when all threads were blocked equally,
independently of which device they were writing to, or how fast it was.
Since that time the writeback algorithm has changed substantially with
different threads getting different allowances based on non-trivial
heuristics. This means the simple "add 25%" heuristic is no longer
reliable.

This patch changes the heuristic to ignore the global limits and
consider only the limit relevant to the bdi being written to. This
approach is already available for BDI_CAP_STRICTLIMIT users (fuse) and
should not introduce surprises. This has the desired result of
protecting the task from the consequences of large amounts of dirty data
queued for other devices.

This approach of "only consider the target bdi" is consistent with the
other use of PF_LESS_THROTTLE in current_may_throttle(), where it causes
attention to be focussed only on the target bdi.

So this patch
- renames PF_LESS_THROTTLE to PF_LOCAL_THROTTLE,
- removes the 25% bonus that that flag gives, and
- imposes 'strictlimit' handling for any process with PF_LOCAL_THROTTLE
set.

Note that previously realtime threads were treated the same as
PF_LESS_THROTTLE threads. This patch does *not* change the behaviour for
real-time threads, so it is now different from the behaviour of nfsd and
loop tasks. I don't know what is wanted for realtime.
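
To make the intended usage concrete, here is a hypothetical consumer of
the flag (flusher_fn and service_queued_requests are made-up names for
illustration, not kernel functions):

	/* Hypothetical daemon that frees pages queued for one bdi by
	 * writing to another. With PF_LOCAL_THROTTLE it is throttled
	 * only against the bdi it writes to ('strictlimit' handling),
	 * never because of dirty pages destined for other devices.
	 */
	static int flusher_fn(void *data)
	{
		current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
		while (!kthread_should_stop())
			service_queued_requests(data);	/* assumed helper */
		return 0;
	}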

Signed-off-by: NeilBrown <[email protected]>
---
drivers/block/loop.c | 2 +-
fs/nfsd/vfs.c | 9 +++++----
include/linux/sched.h | 2 +-
kernel/sys.c | 2 +-
mm/page-writeback.c | 10 ++++++----
mm/vmscan.c | 4 ++--
6 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 739b372a5112..2c59371ce936 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -897,7 +897,7 @@ static void loop_unprepare_queue(struct loop_device *lo)

static int loop_kthread_worker_fn(void *worker_ptr)
{
- current->flags |= PF_LESS_THROTTLE | PF_MEMALLOC_NOIO;
+ current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
return kthread_worker_fn(worker_ptr);
}

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 0aa02eb18bd3..c3fbab1753ec 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -979,12 +979,13 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,

if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
/*
- * We want less throttling in balance_dirty_pages()
- * and shrink_inactive_list() so that nfs to
+ * We want throttling in balance_dirty_pages()
+ * and shrink_inactive_list() to only consider
+ * the backingdev we are writing to, so that nfs to
* localhost doesn't cause nfsd to lock up due to all
* the client's dirty pages or its congested queue.
*/
- current->flags |= PF_LESS_THROTTLE;
+ current->flags |= PF_LOCAL_THROTTLE;

exp = fhp->fh_export;
use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
@@ -1037,7 +1038,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
nfserr = nfserrno(host_err);
}
if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
- current_restore_flags(pflags, PF_LESS_THROTTLE);
+ current_restore_flags(pflags, PF_LOCAL_THROTTLE);
return nfserr;
}

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 04278493bf15..5dcd27abc8cd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1473,7 +1473,7 @@ extern struct pid *cad_pid;
#define PF_KSWAPD 0x00020000 /* I am kswapd */
#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */
#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */
-#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
+#define PF_LOCAL_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
diff --git a/kernel/sys.c b/kernel/sys.c
index d325f3ab624a..180a2fa33f7f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2262,7 +2262,7 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
return -EINVAL;
}

-#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE)
+#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)

SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2caf780a42e7..2afb09fa2fe0 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -387,8 +387,7 @@ static unsigned long global_dirtyable_memory(void)
* Calculate @dtc->thresh and ->bg_thresh considering
* vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller
* must ensure that @dtc->avail is set before calling this function. The
- * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
- * real-time tasks.
+ * dirty limits will be lifted by 1/4 for real-time tasks.
*/
static void domain_dirty_limits(struct dirty_throttle_control *dtc)
{
@@ -436,7 +435,7 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
if (bg_thresh >= thresh)
bg_thresh = thresh / 2;
tsk = current;
- if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
+ if (rt_task(tsk)) {
bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
}
@@ -486,7 +485,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat)
else
dirty = vm_dirty_ratio * node_memory / 100;

- if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
+ if (rt_task(tsk))
dirty += dirty / 4;

return dirty;
@@ -1580,6 +1579,9 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
unsigned long start_time = jiffies;

+ if (current->flags & PF_LOCAL_THROTTLE)
+ /* This task must only be throttled by its own writeback */
+ strictlimit = true;
for (;;) {
unsigned long now = jiffies;
unsigned long dirty, thresh, bg_thresh;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 876370565455..c5cf25938c56 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1880,13 +1880,13 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,

/*
* If a kernel thread (such as nfsd for loop-back mounts) services
- * a backing device by writing to the page cache it sets PF_LESS_THROTTLE.
+ * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE.
* In that case we should only throttle if the backing device it is
* writing to is congested. In other cases it is safe to throttle.
*/
static int current_may_throttle(void)
{
- return !(current->flags & PF_LESS_THROTTLE) ||
+ return !(current->flags & PF_LOCAL_THROTTLE) ||
current->backing_dev_info == NULL ||
bdi_write_congested(current->backing_dev_info);
}
--
2.26.0



2020-04-01 23:54:31

by NeilBrown

Subject: [PATCH 2/2] Deprecate NR_UNSTABLE_NFS, use NR_WRITEBACK


After an NFS page has been written it is considered "unstable" until a
COMMIT request succeeds. If the COMMIT fails, the page will be
re-written.

These "unstable" pages are currently accounted as "reclaimable",
either in WB_RECLAIMABLE, or in NR_UNSTABLE_NFS which is included in a
'reclaimable' count. This might have made sense when sending the COMMIT
required a separate action by the VFS/MM (e.g. releasepage() used to
send a COMMIT). However now that all writes generated by ->writepages()
will automatically be followed by a COMMIT, it makes more sense to
treat them as writeback pages.

So this patch deprecates NR_UNSTABLE_NFS and accounts unstable pages in
NR_WRITEBACK and WB_WRITEBACK.

A particular effect of this change is that when
wb_check_background_flush() calls wb_over_bg_threshold(), the latter
will report 'true' a lot less often as the 'unstable' pages are no
longer considered 'dirty' (and there is nothing that writeback can do
about them anyway).

Currently wb_check_background_flush() will trigger writeback to NFS even
when there are relatively few dirty pages (if there are lots of unstable
pages); this can result in small writes going to the server (10s of
Kilobytes rather than a Megabyte) which hurts throughput.
With this patch, there are fewer writes which are each larger on average.

Signed-off-by: NeilBrown <[email protected]>
---
fs/fs-writeback.c | 1 -
fs/nfs/internal.h | 7 +++++--
fs/nfs/write.c | 4 ++--
include/linux/mmzone.h | 2 +-
mm/memcontrol.c | 1 -
mm/page-writeback.c | 7 ++-----
6 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 76ac9c7d32ec..c5bdf46e3b4b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1070,7 +1070,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
static unsigned long get_nr_dirty_pages(void)
{
return global_node_page_state(NR_FILE_DIRTY) +
- global_node_page_state(NR_UNSTABLE_NFS) +
get_nr_dirty_inodes();
}

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index f80c47d5ff27..ba1ff5adeccd 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -660,8 +660,11 @@ void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo)
if (!cinfo->dreq) {
struct inode *inode = page_file_mapping(page)->host;

- inc_node_page_state(page, NR_UNSTABLE_NFS);
- inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
+ /* This page is really still in write-back - just that the
+ * writeback is happening on the server now.
+ */
+ inc_node_page_state(page, NR_WRITEBACK);
+ inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
}
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c478b772cc49..2e15a56620b3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -958,9 +958,9 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
static void
nfs_clear_page_commit(struct page *page)
{
- dec_node_page_state(page, NR_UNSTABLE_NFS);
+ dec_node_page_state(page, NR_WRITEBACK);
dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb,
- WB_RECLAIMABLE);
+ WB_WRITEBACK);
}

/* Called holding the request lock on @req */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 462f6873905a..227fcb8cd0e6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -237,7 +237,7 @@ enum node_stat_item {
NR_FILE_THPS,
NR_FILE_PMDMAPPED,
NR_ANON_THPS,
- NR_UNSTABLE_NFS, /* NFS unstable pages */
+ NR_UNSTABLE_NFS, /* NFS unstable pages - DEPRECATED DO NOT USE */
NR_VMSCAN_WRITE,
NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */
NR_DIRTIED, /* page dirtyings since bootup */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7ddf91c4295f..fad8e8a23235 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4317,7 +4317,6 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,

*pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);

- /* this should eventually include NR_UNSTABLE_NFS */
*pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
*pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2afb09fa2fe0..d1f03c799d11 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -504,7 +504,6 @@ bool node_dirty_ok(struct pglist_data *pgdat)
unsigned long nr_pages = 0;

nr_pages += node_page_state(pgdat, NR_FILE_DIRTY);
- nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS);
nr_pages += node_page_state(pgdat, NR_WRITEBACK);

return nr_pages <= limit;
@@ -1595,8 +1594,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
* written to the server's write cache, but has not yet
* been flushed to permanent storage.
*/
- nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) +
- global_node_page_state(NR_UNSTABLE_NFS);
+ nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
gdtc->avail = global_dirtyable_memory();
gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);

@@ -1940,8 +1938,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
* as we're trying to decide whether to put more under writeback.
*/
gdtc->avail = global_dirtyable_memory();
- gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) +
- global_node_page_state(NR_UNSTABLE_NFS);
+ gdtc->dirty = global_node_page_state(NR_FILE_DIRTY);
domain_dirty_limits(gdtc);

if (gdtc->dirty > gdtc->bg_thresh)
--
2.26.0



2020-04-02 04:59:51

by NeilBrown

Subject: Re: [PATCH 1/2] MM: replace PF_LESS_THROTTLE with PF_LOCAL_THROTTLE

On Thu, Apr 02 2020, Hillf Danton wrote:

> On Thu, 02 Apr 2020 10:53:20 +1100 NeilBrown wrote:
>>
>> PF_LESS_THROTTLE exists for loop-back nfsd, and a similar need in the
>> loop block driver, where a daemon needs to write to one bdi in
>> order to free up writes queued to another bdi.
>>
>> The daemon sets PF_LESS_THROTTLE and gets a larger allowance of dirty
>> pages, so that it can still dirty pages after other processes have been
>> throttled.
>>
>> This approach was designed when all threads were blocked equally,
>> independently of which device they were writing to, or how fast it was.
>> Since that time the writeback algorithm has changed substantially with
>> different threads getting different allowances based on non-trivial
>> heuristics. This means the simple "add 25%" heuristic is no longer
>> reliable.
>>
>> This patch changes the heuristic to ignore the global limits and
>> consider only the limit relevant to the bdi being written to. This
>> approach is already available for BDI_CAP_STRICTLIMIT users (fuse) and
>> should not introduce surprises. This has the desired result of
>> protecting the task from the consequences of large amounts of dirty data
>> queued for other devices.
>>
>> This approach of "only consider the target bdi" is consistent with the
>> other use of PF_LESS_THROTTLE in current_may_throttle(), where it causes
>> attention to be focussed only on the target bdi.
>>
>> So this patch
>> - renames PF_LESS_THROTTLE to PF_LOCAL_THROTTLE,
>> - removes the 25% bonus that that flag gives, and
>> - imposes 'strictlimit' handling for any process with PF_LOCAL_THROTTLE
>> set.
>
> /*
> * The strictlimit feature is a tool preventing mistrusted filesystems
> * from growing a large number of dirty pages before throttling. For
>
> Based on the comment snippet, I suspect it is applicable to IO flushers
> unless they are likely generating tons of dirty pages. If they are,
> however, cutting their bonuses seems questionable.

The purpose of the strictlimit feature was to isolate one filesystem
(bdi) from all others, so that the one cannot create dirty pages which
unfairly disadvantage the others - this is what that comment says.
But the implementation appears to focus on the isolation, not the
specific purpose, and isolation works both ways. It protects the others
from the one, and the one from the others.

fuse needs to be isolated so it doesn't harm others.
nfsd and loop need to be isolated so they aren't harmed by others.
I'm less familiar with IO flushers but I suspect they have exactly the
same need as nfsd and loop - they need to be isolated from dirty pages
other than on the device they are writing to.
The 25% bonus was never about giving them a bonus because they need it.
It was about protecting them from excess usage elsewhere. I strongly
suspect that my change will provide a conceptually better service for IO
flushers. (Whether it is better in a practical, measurable sense I cannot
say, but I'd be surprised if it were worse.)

One possible problem with strictlimit isolation is suggested by the
comment

*
* In strictlimit case make decision based on the wb counters
* and limits. Small writeouts when the wb limits are ramping
* up are the price we consciously pay for strictlimit-ing.
*

This suggests that start-up transients may be worse in some cases.
I haven't noticed any problems in my (limited) testing, so while I
suspect there is a genuine difference here, I don't expect it to be
problematic.

>
>>
>> Note that previously realtime threads were treated the same as
>> PF_LESS_THROTTLE threads. This patch does *not* change the behaviour for
>> real-time threads, so it is now different from the behaviour of nfsd and
>> loop tasks. I don't know what is wanted for realtime.
>>
>> Signed-off-by: NeilBrown <[email protected]>
>> =2D--
>
> Hrm corrupted delivery?

No, that's just normal email quotation for a multi-part message (needed
for crypto-signing, which I do by default).
'git am', for example, is quite capable of coping with it.

Thanks,
NeilBrown

>
>> drivers/block/loop.c | 2 +-
>> fs/nfsd/vfs.c | 9 +++++----
>> include/linux/sched.h | 2 +-
>> kernel/sys.c | 2 +-
>> mm/page-writeback.c | 10 ++++++----
>> mm/vmscan.c | 4 ++--
>> 6 files changed, 16 insertions(+), 13 deletions(-)



2020-04-02 15:11:04

by Christoph Hellwig

Subject: Re: [PATCH 2/2] Deprecate NR_UNSTABLE_NFS, use NR_WRITEBACK

On Thu, Apr 02, 2020 at 10:54:07AM +1100, NeilBrown wrote:
>
> After an NFS page has been written it is considered "unstable" until a
> COMMIT request succeeds. If the COMMIT fails, the page will be
> re-written.
>
> These "unstable" pages are currently accounted as "reclaimable",
> either in WB_RECLAIMABLE, or in NR_UNSTABLE_NFS which is included in a
> 'reclaimable' count. This might have made sense when sending the COMMIT
> required a separate action by the VFS/MM (e.g. releasepage() used to
> send a COMMIT). However now that all writes generated by ->writepages()
> will automatically be followed by a COMMIT, it makes more sense to
> treat them as writeback pages.
>
> So this patch deprecates NR_UNSTABLE_NFS and accounts unstable pages in
> NR_WRITEBACK and WB_WRITEBACK.

Please remove it entirely if it isn't used any more.

2020-04-02 17:03:54

by Jan Kara

Subject: Re: [PATCH 1/2] MM: replace PF_LESS_THROTTLE with PF_LOCAL_THROTTLE

On Thu 02-04-20 10:53:20, NeilBrown wrote:
>
> PF_LESS_THROTTLE exists for loop-back nfsd, and a similar need in the
> loop block driver, where a daemon needs to write to one bdi in
> order to free up writes queued to another bdi.
>
> The daemon sets PF_LESS_THROTTLE and gets a larger allowance of dirty
> pages, so that it can still dirty pages after other processses have been
> throttled.
>
> This approach was designed when all threads were blocked equally,
> independently of which device they were writing to, or how fast it was.
> Since that time the writeback algorithm has changed substantially with
> different threads getting different allowances based on non-trivial
> heuristics. This means the simple "add 25%" heuristic is no longer
> reliable.
>
> This patch changes the heuristic to ignore the global limits and
> consider only the limit relevant to the bdi being written to. This
> approach is already available for BDI_CAP_STRICTLIMIT users (fuse) and
> should not introduce surprises. This has the desired result of
> protecting the task from the consequences of large amounts of dirty data
> queued for other devices.
>
> This approach of "only consider the target bdi" is consistent with the
> other use of PF_LESS_THROTTLE in current_may_throttle(), where it causes
> attention to be focussed only on the target bdi.
>
> So this patch
> - renames PF_LESS_THROTTLE to PF_LOCAL_THROTTLE,
> - removes the 25% bonus that that flag gives, and
> - imposes 'strictlimit' handling for any process with PF_LOCAL_THROTTLE
> set.
>
> Note that previously realtime threads were treated the same as
> PF_LESS_THROTTLE threads. This patch does *not* change the behaviour for
> real-time threads, so it is now different from the behaviour of nfsd and
> loop tasks. I don't know what is wanted for realtime.
>
> Signed-off-by: NeilBrown <[email protected]>

This makes sense to me and the patch looks good. You can add:

Reviewed-by: Jan Kara <[email protected]>

Thanks.

Honza

> ---
> drivers/block/loop.c | 2 +-
> fs/nfsd/vfs.c | 9 +++++----
> include/linux/sched.h | 2 +-
> kernel/sys.c | 2 +-
> mm/page-writeback.c | 10 ++++++----
> mm/vmscan.c | 4 ++--
> 6 files changed, 16 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 739b372a5112..2c59371ce936 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -897,7 +897,7 @@ static void loop_unprepare_queue(struct loop_device *lo)
>
> static int loop_kthread_worker_fn(void *worker_ptr)
> {
> - current->flags |= PF_LESS_THROTTLE | PF_MEMALLOC_NOIO;
> + current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
> return kthread_worker_fn(worker_ptr);
> }
>
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 0aa02eb18bd3..c3fbab1753ec 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -979,12 +979,13 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
>
> if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
> /*
> - * We want less throttling in balance_dirty_pages()
> - * and shrink_inactive_list() so that nfs to
> + * We want throttling in balance_dirty_pages()
> + * and shrink_inactive_list() to only consider
> + * the backingdev we are writing to, so that nfs to
> * localhost doesn't cause nfsd to lock up due to all
> * the client's dirty pages or its congested queue.
> */
> - current->flags |= PF_LESS_THROTTLE;
> + current->flags |= PF_LOCAL_THROTTLE;
>
> exp = fhp->fh_export;
> use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
> @@ -1037,7 +1038,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
> nfserr = nfserrno(host_err);
> }
> if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
> - current_restore_flags(pflags, PF_LESS_THROTTLE);
> + current_restore_flags(pflags, PF_LOCAL_THROTTLE);
> return nfserr;
> }
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 04278493bf15..5dcd27abc8cd 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1473,7 +1473,7 @@ extern struct pid *cad_pid;
> #define PF_KSWAPD 0x00020000 /* I am kswapd */
> #define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */
> #define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */
> -#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
> +#define PF_LOCAL_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
> #define PF_KTHREAD 0x00200000 /* I am a kernel thread */
> #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
> #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
> diff --git a/kernel/sys.c b/kernel/sys.c
> index d325f3ab624a..180a2fa33f7f 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -2262,7 +2262,7 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
> return -EINVAL;
> }
>
> -#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE)
> +#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)
>
> SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
> unsigned long, arg4, unsigned long, arg5)
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 2caf780a42e7..2afb09fa2fe0 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -387,8 +387,7 @@ static unsigned long global_dirtyable_memory(void)
> * Calculate @dtc->thresh and ->bg_thresh considering
> * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller
> * must ensure that @dtc->avail is set before calling this function. The
> - * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
> - * real-time tasks.
> + * dirty limits will be lifted by 1/4 for real-time tasks.
> */
> static void domain_dirty_limits(struct dirty_throttle_control *dtc)
> {
> @@ -436,7 +435,7 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
> if (bg_thresh >= thresh)
> bg_thresh = thresh / 2;
> tsk = current;
> - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
> + if (rt_task(tsk)) {
> bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
> thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
> }
> @@ -486,7 +485,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat)
> else
> dirty = vm_dirty_ratio * node_memory / 100;
>
> - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
> + if (rt_task(tsk))
> dirty += dirty / 4;
>
> return dirty;
> @@ -1580,6 +1579,9 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
> bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
> unsigned long start_time = jiffies;
>
> + if (current->flags & PF_LOCAL_THROTTLE)
> + /* This task must only be throttled by its own writeback */
> + strictlimit = true;
> for (;;) {
> unsigned long now = jiffies;
> unsigned long dirty, thresh, bg_thresh;
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 876370565455..c5cf25938c56 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1880,13 +1880,13 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
>
> /*
> * If a kernel thread (such as nfsd for loop-back mounts) services
> - * a backing device by writing to the page cache it sets PF_LESS_THROTTLE.
> + * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE.
> * In that case we should only throttle if the backing device it is
> * writing to is congested. In other cases it is safe to throttle.
> */
> static int current_may_throttle(void)
> {
> - return !(current->flags & PF_LESS_THROTTLE) ||
> + return !(current->flags & PF_LOCAL_THROTTLE) ||
> current->backing_dev_info == NULL ||
> bdi_write_congested(current->backing_dev_info);
> }
> --
> 2.26.0
>


--
Jan Kara <[email protected]>
SUSE Labs, CR

2020-04-02 19:56:28

by Jan Kara

Subject: Re: [PATCH 2/2] Deprecate NR_UNSTABLE_NFS, use NR_WRITEBACK

On Thu 02-04-20 10:54:07, NeilBrown wrote:
>
> After an NFS page has been written it is considered "unstable" until a
> COMMIT request succeeds. If the COMMIT fails, the page will be
> re-written.
>
> These "unstable" pages are currently accounted as "reclaimable",
> either in WB_RECLAIMABLE, or in NR_UNSTABLE_NFS which is included in a
> 'reclaimable' count. This might have made sense when sending the COMMIT
> required a separate action by the VFS/MM (e.g. releasepage() used to
> send a COMMIT). However now that all writes generated by ->writepages()
> will automatically be followed by a COMMIT, it makes more sense to
> treat them as writeback pages.
>
> So this patch deprecates NR_UNSTABLE_NFS and accounts unstable pages in
> NR_WRITEBACK and WB_WRITEBACK.
>
> A particular effect of this change is that when
> wb_check_background_flush() calls wb_over_bg_threshold(), the latter
> will report 'true' a lot less often as the 'unstable' pages are no
> longer considered 'dirty' (and there is nothing that writeback can do
> about them anyway).
>
> Currently wb_check_background_flush() will trigger writeback to NFS even
> when there are relatively few dirty pages (if there are lots of unstable
> pages); this can result in small writes going to the server (10s of
> Kilobytes rather than a Megabyte) which hurts throughput.
> With this patch, there are fewer writes which are each larger on average.
>
> Signed-off-by: NeilBrown <[email protected]>

The patch looks good to me. I agree with Christoph that it would be best to
remove NR_UNSTABLE_NFS completely. We just have to be careful not to change
the format of any entries in /proc/ where it currently gets reported...

Honza

> ---
> fs/fs-writeback.c | 1 -
> fs/nfs/internal.h | 7 +++++--
> fs/nfs/write.c | 4 ++--
> include/linux/mmzone.h | 2 +-
> mm/memcontrol.c | 1 -
> mm/page-writeback.c | 7 ++-----
> 6 files changed, 10 insertions(+), 12 deletions(-)
>
> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
> index 76ac9c7d32ec..c5bdf46e3b4b 100644
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -1070,7 +1070,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
> static unsigned long get_nr_dirty_pages(void)
> {
> return global_node_page_state(NR_FILE_DIRTY) +
> - global_node_page_state(NR_UNSTABLE_NFS) +
> get_nr_dirty_inodes();
> }
>
> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> index f80c47d5ff27..ba1ff5adeccd 100644
> --- a/fs/nfs/internal.h
> +++ b/fs/nfs/internal.h
> @@ -660,8 +660,11 @@ void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo)
> if (!cinfo->dreq) {
> struct inode *inode = page_file_mapping(page)->host;
>
> - inc_node_page_state(page, NR_UNSTABLE_NFS);
> - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
> + /* This page is really still in write-back - just that the
> + * writeback is happening on the server now.
> + */
> + inc_node_page_state(page, NR_WRITEBACK);
> + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
> __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
> }
> }
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index c478b772cc49..2e15a56620b3 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -958,9 +958,9 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
> static void
> nfs_clear_page_commit(struct page *page)
> {
> - dec_node_page_state(page, NR_UNSTABLE_NFS);
> + dec_node_page_state(page, NR_WRITEBACK);
> dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb,
> - WB_RECLAIMABLE);
> + WB_WRITEBACK);
> }
>
> /* Called holding the request lock on @req */
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 462f6873905a..227fcb8cd0e6 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -237,7 +237,7 @@ enum node_stat_item {
> NR_FILE_THPS,
> NR_FILE_PMDMAPPED,
> NR_ANON_THPS,
> - NR_UNSTABLE_NFS, /* NFS unstable pages */
> + NR_UNSTABLE_NFS, /* NFS unstable pages - DEPRECATED DO NOT USE */
> NR_VMSCAN_WRITE,
> NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */
> NR_DIRTIED, /* page dirtyings since bootup */
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 7ddf91c4295f..fad8e8a23235 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4317,7 +4317,6 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
>
> *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
>
> - /* this should eventually include NR_UNSTABLE_NFS */
> *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
> *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
> memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 2afb09fa2fe0..d1f03c799d11 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -504,7 +504,6 @@ bool node_dirty_ok(struct pglist_data *pgdat)
> unsigned long nr_pages = 0;
>
> nr_pages += node_page_state(pgdat, NR_FILE_DIRTY);
> - nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS);
> nr_pages += node_page_state(pgdat, NR_WRITEBACK);
>
> return nr_pages <= limit;
> @@ -1595,8 +1594,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
> * written to the server's write cache, but has not yet
> * been flushed to permanent storage.
> */
> - nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) +
> - global_node_page_state(NR_UNSTABLE_NFS);
> + nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
> gdtc->avail = global_dirtyable_memory();
> gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
>
> @@ -1940,8 +1938,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
> * as we're trying to decide whether to put more under writeback.
> */
> gdtc->avail = global_dirtyable_memory();
> - gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) +
> - global_node_page_state(NR_UNSTABLE_NFS);
> + gdtc->dirty = global_node_page_state(NR_FILE_DIRTY);
> domain_dirty_limits(gdtc);
>
> if (gdtc->dirty > gdtc->bg_thresh)
> --
> 2.26.0
>


--
Jan Kara <[email protected]>
SUSE Labs, CR

2020-04-02 22:36:05

by NeilBrown

Subject: [PATCH 2/2 - v2] MM: Discard NR_UNSTABLE_NFS, use NR_WRITEBACK instead.


After an NFS page has been written it is considered "unstable" until a
COMMIT request succeeds. If the COMMIT fails, the page will be
re-written.

These "unstable" pages are currently accounted as "reclaimable", either
in WB_RECLAIMABLE, or in NR_UNSTABLE_NFS which is included in a
'reclaimable' count. This might have made sense when sending the COMMIT
required a separate action by the VFS/MM (e.g. releasepage() used to
send a COMMIT). However now that all writes generated by ->writepages()
will automatically be followed by a COMMIT (since commit 919e3bd9a875
("NFS: Ensure we commit after writeback is complete")) it makes more
sense to treat them as writeback pages.

So this patch removes NR_UNSTABLE_NFS and accounts unstable pages in
NR_WRITEBACK and WB_WRITEBACK.

A particular effect of this change is that when
wb_check_background_flush() calls wb_over_bg_threshold(), the latter
will report 'true' a lot less often as the 'unstable' pages are no
longer considered 'dirty' (and there is nothing that writeback can do
about them anyway).

Currently wb_check_background_flush() will trigger writeback to NFS even
when there are relatively few dirty pages (if there are lots of unstable
pages); this can result in small writes going to the server (10s of
Kilobytes rather than a Megabyte) which hurts throughput.
With this patch, there are fewer writes which are each larger on average.

Signed-off-by: NeilBrown <[email protected]>
---

NR_UNSTABLE_NFS completely removed as recommended by Christoph, removal
of an unnecessary comment, and improvements to commit message.
Thanks.

Documentation/filesystems/proc.txt | 3 ---
drivers/base/node.c | 2 --
fs/fs-writeback.c | 1 -
fs/nfs/internal.h | 10 +++++++---
fs/nfs/write.c | 4 ++--
fs/proc/meminfo.c | 2 --
include/linux/mmzone.h | 1 -
include/trace/events/writeback.h | 5 +----
mm/memcontrol.c | 1 -
mm/page-writeback.c | 17 ++++-------------
mm/page_alloc.c | 5 +----
mm/vmstat.c | 1 -
12 files changed, 15 insertions(+), 37 deletions(-)

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 99ca040e3f90..690c712f5f79 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -904,7 +904,6 @@ Slab: 284364 kB
SReclaimable: 159856 kB
SUnreclaim: 124508 kB
PageTables: 24448 kB
-NFS_Unstable: 0 kB
Bounce: 0 kB
WritebackTmp: 0 kB
CommitLimit: 7669796 kB
@@ -975,8 +974,6 @@ SReclaimable: Part of Slab, that might be reclaimed, such as caches
SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure
PageTables: amount of memory dedicated to the lowest level of page
tables.
-NFS_Unstable: NFS pages sent to the server, but not yet committed to stable
- storage
Bounce: Memory used for block device "bounce buffers"
WritebackTmp: Memory used by FUSE for temporary writeback buffers
CommitLimit: Based on the overcommit ratio ('vm.overcommit_ratio'),
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 98a31bafc8a2..7059021ce2af 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -416,7 +416,6 @@ static ssize_t node_read_meminfo(struct device *dev,
"Node %d Shmem: %8lu kB\n"
"Node %d KernelStack: %8lu kB\n"
"Node %d PageTables: %8lu kB\n"
- "Node %d NFS_Unstable: %8lu kB\n"
"Node %d Bounce: %8lu kB\n"
"Node %d WritebackTmp: %8lu kB\n"
"Node %d KReclaimable: %8lu kB\n"
@@ -439,7 +438,6 @@ static ssize_t node_read_meminfo(struct device *dev,
nid, K(i.sharedram),
nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK_KB),
nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)),
- nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)),
nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
nid, K(sreclaimable +
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 76ac9c7d32ec..c5bdf46e3b4b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1070,7 +1070,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
static unsigned long get_nr_dirty_pages(void)
{
return global_node_page_state(NR_FILE_DIRTY) +
- global_node_page_state(NR_UNSTABLE_NFS) +
get_nr_dirty_inodes();
}

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index f80c47d5ff27..749da02b547a 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -652,7 +652,8 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
}

/*
- * Record the page as unstable and mark its inode as dirty.
+ * Record the page as unstable (an extra writeback period) and mark its
+ * inode as dirty.
*/
static inline
void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo)
@@ -660,8 +661,11 @@ void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo)
if (!cinfo->dreq) {
struct inode *inode = page_file_mapping(page)->host;

- inc_node_page_state(page, NR_UNSTABLE_NFS);
- inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
+ /* This page is really still in write-back - just that the
+ * writeback is happening on the server now.
+ */
+ inc_node_page_state(page, NR_WRITEBACK);
+ inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
}
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c478b772cc49..2e15a56620b3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -958,9 +958,9 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
static void
nfs_clear_page_commit(struct page *page)
{
- dec_node_page_state(page, NR_UNSTABLE_NFS);
+ dec_node_page_state(page, NR_WRITEBACK);
dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb,
- WB_RECLAIMABLE);
+ WB_WRITEBACK);
}

/* Called holding the request lock on @req */
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 8c1f1bb1a5ce..1378a132ff7e 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -106,8 +106,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "PageTables: ",
global_zone_page_state(NR_PAGETABLE));

- show_val_kb(m, "NFS_Unstable: ",
- global_node_page_state(NR_UNSTABLE_NFS));
show_val_kb(m, "Bounce: ",
global_zone_page_state(NR_BOUNCE));
show_val_kb(m, "WritebackTmp: ",
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 462f6873905a..a18611197bea 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -237,7 +237,6 @@ enum node_stat_item {
NR_FILE_THPS,
NR_FILE_PMDMAPPED,
NR_ANON_THPS,
- NR_UNSTABLE_NFS, /* NFS unstable pages */
NR_VMSCAN_WRITE,
NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */
NR_DIRTIED, /* page dirtyings since bootup */
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index d94def25e4dc..45b5fbdb1f62 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -542,7 +542,6 @@ TRACE_EVENT(global_dirty_state,
TP_STRUCT__entry(
__field(unsigned long, nr_dirty)
__field(unsigned long, nr_writeback)
- __field(unsigned long, nr_unstable)
__field(unsigned long, background_thresh)
__field(unsigned long, dirty_thresh)
__field(unsigned long, dirty_limit)
@@ -553,7 +552,6 @@ TRACE_EVENT(global_dirty_state,
TP_fast_assign(
__entry->nr_dirty = global_node_page_state(NR_FILE_DIRTY);
__entry->nr_writeback = global_node_page_state(NR_WRITEBACK);
- __entry->nr_unstable = global_node_page_state(NR_UNSTABLE_NFS);
__entry->nr_dirtied = global_node_page_state(NR_DIRTIED);
__entry->nr_written = global_node_page_state(NR_WRITTEN);
__entry->background_thresh = background_thresh;
@@ -561,12 +559,11 @@ TRACE_EVENT(global_dirty_state,
__entry->dirty_limit = global_wb_domain.dirty_limit;
),

- TP_printk("dirty=%lu writeback=%lu unstable=%lu "
+ TP_printk("dirty=%lu writeback=%lu "
"bg_thresh=%lu thresh=%lu limit=%lu "
"dirtied=%lu written=%lu",
__entry->nr_dirty,
__entry->nr_writeback,
- __entry->nr_unstable,
__entry->background_thresh,
__entry->dirty_thresh,
__entry->dirty_limit,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7ddf91c4295f..fad8e8a23235 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4317,7 +4317,6 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,

*pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);

- /* this should eventually include NR_UNSTABLE_NFS */
*pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
*pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2afb09fa2fe0..dbc73522609e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -504,7 +504,6 @@ bool node_dirty_ok(struct pglist_data *pgdat)
unsigned long nr_pages = 0;

nr_pages += node_page_state(pgdat, NR_FILE_DIRTY);
- nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS);
nr_pages += node_page_state(pgdat, NR_WRITEBACK);

return nr_pages <= limit;
@@ -758,7 +757,7 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
* bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
*
* Return: @wb's dirty limit in pages. The term "dirty" in the context of
- * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
+ * dirty balancing includes all PG_dirty and PG_writeback pages.
*/
static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
{
@@ -1566,7 +1565,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
&mdtc_stor : NULL;
struct dirty_throttle_control *sdtc;
- unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
+ unsigned long nr_reclaimable; /* = file_dirty */
long period;
long pause;
long max_pause;
@@ -1589,14 +1588,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
unsigned long m_thresh = 0;
unsigned long m_bg_thresh = 0;

- /*
- * Unstable writes are a feature of certain networked
- * filesystems (i.e. NFS) in which data may have been
- * written to the server's write cache, but has not yet
- * been flushed to permanent storage.
- */
- nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) +
- global_node_page_state(NR_UNSTABLE_NFS);
+ nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
gdtc->avail = global_dirtyable_memory();
gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);

@@ -1940,8 +1932,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
* as we're trying to decide whether to put more under writeback.
*/
gdtc->avail = global_dirtyable_memory();
- gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) +
- global_node_page_state(NR_UNSTABLE_NFS);
+ gdtc->dirty = global_node_page_state(NR_FILE_DIRTY);
domain_dirty_limits(gdtc);

if (gdtc->dirty > gdtc->bg_thresh)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3c4eb750a199..6bd1112d590d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5239,7 +5239,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)

printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
- " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
+ " unevictable:%lu dirty:%lu writeback:%lu\n"
" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
" free:%lu free_pcp:%lu free_cma:%lu\n",
@@ -5252,7 +5252,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
global_node_page_state(NR_UNEVICTABLE),
global_node_page_state(NR_FILE_DIRTY),
global_node_page_state(NR_WRITEBACK),
- global_node_page_state(NR_UNSTABLE_NFS),
global_node_page_state(NR_SLAB_RECLAIMABLE),
global_node_page_state(NR_SLAB_UNRECLAIMABLE),
global_node_page_state(NR_FILE_MAPPED),
@@ -5285,7 +5284,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" anon_thp: %lukB"
#endif
" writeback_tmp:%lukB"
- " unstable:%lukB"
" all_unreclaimable? %s"
"\n",
pgdat->node_id,
@@ -5307,7 +5305,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
#endif
K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
- K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
"yes" : "no");
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 78d53378db99..d1291537bbb9 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1162,7 +1162,6 @@ const char * const vmstat_text[] = {
"nr_file_hugepages",
"nr_file_pmdmapped",
"nr_anon_transparent_hugepages",
- "nr_unstable",
"nr_vmscan_write",
"nr_vmscan_immediate_reclaim",
"nr_dirtied",
--
2.26.0



2020-04-03 09:42:57

by Jan Kara

Subject: Re: [PATCH 2/2 - v2] MM: Discard NR_UNSTABLE_NFS, use NR_WRITEBACK instead.

On Fri 03-04-20 09:35:21, NeilBrown wrote:
>
> After an NFS page has been written it is considered "unstable" until a
> COMMIT request succeeds. If the COMMIT fails, the page will be
> re-written.
>
> These "unstable" pages are currently accounted as "reclaimable", either
> in WB_RECLAIMABLE, or in NR_UNSTABLE_NFS which is included in a
> 'reclaimable' count. This might have made sense when sending the COMMIT
> required a separate action by the VFS/MM (e.g. releasepage() used to
> send a COMMIT). However now that all writes generated by ->writepages()
> will automatically be followed by a COMMIT (since commit 919e3bd9a875
> ("NFS: Ensure we commit after writeback is complete")) it makes more
> sense to treat them as writeback pages.
>
> So this patch removes NR_UNSTABLE_NFS and accounts unstable pages in
> NR_WRITEBACK and WB_WRITEBACK.
>
> A particular effect of this change is that when
> wb_check_background_flush() calls wb_over_bg_threshold(), the latter
> will report 'true' a lot less often as the 'unstable' pages are no
> longer considered 'dirty' (and there is nothing that writeback can do
> about them anyway).
>
> Currently wb_check_background_flush() will trigger writeback to NFS even
> when there are relatively few dirty pages (if there are lots of unstable
> pages); this can result in small writes going to the server (10s of
> Kilobytes rather than a Megabyte) which hurts throughput.
> With this patch, there are fewer writes which are each larger on average.
>
> Signed-off-by: NeilBrown <[email protected]>
> ---
>
> NR_UNSTABLE_NFS completely removed as recommended by Christoph, removal
> of an unnecessary comment, and improvements to commit message.
> Thanks.
>
> Documentation/filesystems/proc.txt | 3 ---
> drivers/base/node.c | 2 --
> fs/fs-writeback.c | 1 -
> fs/nfs/internal.h | 10 +++++++---
> fs/nfs/write.c | 4 ++--
> fs/proc/meminfo.c | 2 --
> include/linux/mmzone.h | 1 -
> include/trace/events/writeback.h | 5 +----
> mm/memcontrol.c | 1 -
> mm/page-writeback.c | 17 ++++-------------
> mm/page_alloc.c | 5 +----
> mm/vmstat.c | 1 -
> 12 files changed, 15 insertions(+), 37 deletions(-)
>
> diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
> index 99ca040e3f90..690c712f5f79 100644
> --- a/Documentation/filesystems/proc.txt
> +++ b/Documentation/filesystems/proc.txt
> @@ -904,7 +904,6 @@ Slab: 284364 kB
> SReclaimable: 159856 kB
> SUnreclaim: 124508 kB
> PageTables: 24448 kB
> -NFS_Unstable: 0 kB
> Bounce: 0 kB
> WritebackTmp: 0 kB
> CommitLimit: 7669796 kB
> @@ -975,8 +974,6 @@ SReclaimable: Part of Slab, that might be reclaimed, such as caches
> SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure
> PageTables: amount of memory dedicated to the lowest level of page
> tables.
> -NFS_Unstable: NFS pages sent to the server, but not yet committed to stable
> - storage
> Bounce: Memory used for block device "bounce buffers"
> WritebackTmp: Memory used by FUSE for temporary writeback buffers
> CommitLimit: Based on the overcommit ratio ('vm.overcommit_ratio'),
> diff --git a/drivers/base/node.c b/drivers/base/node.c
> index 98a31bafc8a2..7059021ce2af 100644
> --- a/drivers/base/node.c
> +++ b/drivers/base/node.c
> @@ -416,7 +416,6 @@ static ssize_t node_read_meminfo(struct device *dev,
> "Node %d Shmem: %8lu kB\n"
> "Node %d KernelStack: %8lu kB\n"
> "Node %d PageTables: %8lu kB\n"
> - "Node %d NFS_Unstable: %8lu kB\n"
> "Node %d Bounce: %8lu kB\n"
> "Node %d WritebackTmp: %8lu kB\n"
> "Node %d KReclaimable: %8lu kB\n"
> @@ -439,7 +438,6 @@ static ssize_t node_read_meminfo(struct device *dev,
> nid, K(i.sharedram),
> nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK_KB),
> nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)),
> - nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
> nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)),
> nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
> nid, K(sreclaimable +

So I don't think we can just remove lines from procfs files like this. That
has a high potential of breaking some userspace app that is not careful
enough when parsing the file. So I think that we need to leave there the
format string and just replace K(node_page_state(pgdat, NR_UNSTABLE_NFS))
with 0.
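
For instance (an illustrative sketch, not a tested patch), node_read_meminfo()
would keep the "Node %d NFS_Unstable: %8lu kB\n" format line untouched and
only the matching argument would become a constant:

- nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
+ nid, 0UL,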

> diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
> index 8c1f1bb1a5ce..1378a132ff7e 100644
> --- a/fs/proc/meminfo.c
> +++ b/fs/proc/meminfo.c
> @@ -106,8 +106,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
> show_val_kb(m, "PageTables: ",
> global_zone_page_state(NR_PAGETABLE));
>
> - show_val_kb(m, "NFS_Unstable: ",
> - global_node_page_state(NR_UNSTABLE_NFS));
> show_val_kb(m, "Bounce: ",
> global_zone_page_state(NR_BOUNCE));
> show_val_kb(m, "WritebackTmp: ",

Similarly here.
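
The analogous sketch for meminfo_proc_show() (again illustrative only)
would keep the label and print a constant zero:

- show_val_kb(m, "NFS_Unstable: ",
- global_node_page_state(NR_UNSTABLE_NFS));
+ show_val_kb(m, "NFS_Unstable: ", 0);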

> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index 78d53378db99..d1291537bbb9 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -1162,7 +1162,6 @@ const char * const vmstat_text[] = {
> "nr_file_hugepages",
> "nr_file_pmdmapped",
> "nr_anon_transparent_hugepages",
> - "nr_unstable",
> "nr_vmscan_write",
> "nr_vmscan_immediate_reclaim",
> "nr_dirtied",

This is probably the trickiest to deal with, given how /proc/vmstat is
formatted. OTOH for this file there's a good chance we'd get away with
just deleting the nr_unstable line, because entries have been added in
the middle before (e.g. in 60fbf0ab5da1 last September) and nobody has
complained yet.

What do mm people think? How were changes to vmstat counters handled in the
past?

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2020-04-03 11:05:09

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH 2/2 - v2] MM: Discard NR_UNSTABLE_NFS, use NR_WRITEBACK instead.

On Fri 03-04-20 11:42:20, Jan Kara wrote:
[...]
> > diff --git a/mm/vmstat.c b/mm/vmstat.c
> > index 78d53378db99..d1291537bbb9 100644
> > --- a/mm/vmstat.c
> > +++ b/mm/vmstat.c
> > @@ -1162,7 +1162,6 @@ const char * const vmstat_text[] = {
> > "nr_file_hugepages",
> > "nr_file_pmdmapped",
> > "nr_anon_transparent_hugepages",
> > - "nr_unstable",
> > "nr_vmscan_write",
> > "nr_vmscan_immediate_reclaim",
> > "nr_dirtied",
>
> This is probably the trickiest to deal with, given how /proc/vmstat is
> formatted. OTOH for this file there's a good chance we'd get away with
> just deleting the nr_unstable line, because entries have been added in
> the middle before (e.g. in 60fbf0ab5da1 last September) and nobody has
> complained yet.
>
> What do mm people think? How were changes to vmstat counters handled in the
> past?

Adding new counters in the middle seems to be generally OK. I would be
more worried about removing counters though. So if we can simply print a
phony value at the very end then this should be a reasonable workaround.
--
Michal Hocko
SUSE Labs

2020-04-03 15:17:08

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH 1/2] MM: replace PF_LESS_THROTTLE with PF_LOCAL_THROTTLE

On Thu 02-04-20 10:53:20, Neil Brown wrote:
>
> PF_LESS_THROTTLE exists for loop-back nfsd, and a similar need in the
> loop block driver, where a daemon needs to write to one bdi in
> order to free up writes queued to another bdi.
>
> The daemon sets PF_LESS_THROTTLE and gets a larger allowance of dirty
> pages, so that it can still dirty pages after other processes have been
> throttled.
>
> This approach was designed when all threads were blocked equally,
> independently of which device they were writing to, or how fast it was.
> Since that time the writeback algorithm has changed substantially with
> different threads getting different allowances based on non-trivial
> heuristics. This means the simple "add 25%" heuristic is no longer
> reliable.
>
> This patch changes the heuristic to ignore the global limits and
> consider only the limit relevant to the bdi being written to. This
> approach is already available for BDI_CAP_STRICTLIMIT users (fuse) and
> should not introduce surprises. This has the desired result of
> protecting the task from the consequences of large amounts of dirty data
> queued for other devices.

While I understand that you want to have per bdi throttling for those
"special" files I am still missing how this is going to provide the
additional room that the additional 25% gave them previously. I might
misremember or things have changed (what you mention as non-trivial
heuristics) but PF_LESS_THROTTLE really needed that room to guarantee
forward progress. Care to expand some more on how this is handled now?
Maybe we do not need it anymore but calling that out explicitly would be
really helpful.
--
Michal Hocko
SUSE Labs

2020-04-06 00:14:45

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH 2/2 - v2] MM: Discard NR_UNSTABLE_NFS, use NR_WRITEBACK instead.

On Fri, Apr 03 2020, Michal Hocko wrote:

> On Fri 03-04-20 11:42:20, Jan Kara wrote:
> [...]
>> > diff --git a/mm/vmstat.c b/mm/vmstat.c
>> > index 78d53378db99..d1291537bbb9 100644
>> > --- a/mm/vmstat.c
>> > +++ b/mm/vmstat.c
>> > @@ -1162,7 +1162,6 @@ const char * const vmstat_text[] = {
>> > "nr_file_hugepages",
>> > "nr_file_pmdmapped",
>> > "nr_anon_transparent_hugepages",
>> > - "nr_unstable",
>> > "nr_vmscan_write",
>> > "nr_vmscan_immediate_reclaim",
>> > "nr_dirtied",
>>
>> This is probably the trickiest to deal with, given how /proc/vmstat is
>> formatted. OTOH for this file there's a good chance we'd get away with
>> just deleting the nr_unstable line, because entries have been added in
>> the middle before (e.g. in 60fbf0ab5da1 last September) and nobody has
>> complained yet.
>>
>> What do mm people think? How were changes to vmstat counters handled in the
>> past?
>
> Adding new counters in the middle seems to be generally OK. I would be
> more worried about removing counters though. So if we can simply print a
> phony value at the very end then this should be a reasonable workaround.

At the very end?
Do you mean not having "nr_unstable 0" appear at all, but having "dummy 0"
appear at the end just so that the number of lines doesn't decrease?
Am I misunderstanding?

Thanks,
NeilBrown


Attachments:
signature.asc (847.00 B)

2020-04-06 07:42:59

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH 2/2 - v2] MM: Discard NR_UNSTABLE_NFS, use NR_WRITEBACK instead.

On Mon 06-04-20 10:14:16, Neil Brown wrote:
> On Fri, Apr 03 2020, Michal Hocko wrote:
>
> > On Fri 03-04-20 11:42:20, Jan Kara wrote:
> > [...]
> >> > diff --git a/mm/vmstat.c b/mm/vmstat.c
> >> > index 78d53378db99..d1291537bbb9 100644
> >> > --- a/mm/vmstat.c
> >> > +++ b/mm/vmstat.c
> >> > @@ -1162,7 +1162,6 @@ const char * const vmstat_text[] = {
> >> > "nr_file_hugepages",
> >> > "nr_file_pmdmapped",
> >> > "nr_anon_transparent_hugepages",
> >> > - "nr_unstable",
> >> > "nr_vmscan_write",
> >> > "nr_vmscan_immediate_reclaim",
> >> > "nr_dirtied",
> >>
> >> This is probably the trickiest to deal with, given how /proc/vmstat is
> >> formatted. OTOH for this file there's a good chance we'd get away with
> >> just deleting the nr_unstable line, because entries have been added in
> >> the middle before (e.g. in 60fbf0ab5da1 last September) and nobody has
> >> complained yet.
> >>
> >> What do mm people think? How were changes to vmstat counters handled in the
> >> past?
> >
> > Adding new counters in the middle seems to be generally OK. I would be
> > more worried about removing counters though. So if we can simply print a
> > phony value at the very end then this should be a reasonable workaround.
>
> At the very end?
> Do you mean not having "nr_unstable 0" appear at all, but having "dummy 0"
> appear at the end just so that the number of lines doesn't decrease?
> Am I misunderstanding?

Sorry for not being clear. I meant something like
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 78d53378db99..836e3f7a7aff 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1705,8 +1705,16 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
(*pos)++;
- if (*pos >= NR_VMSTAT_ITEMS)
+ if (*pos >= NR_VMSTAT_ITEMS) {
+ /*
+ * Deprecated counters which are no longer represented
+ * in the vmstat arrays. We just lie that they are
+ * always 0, so as not to break userspace which might
+ * expect them in the output.
+ */
+ seq_puts(m, "nr_unstable 0\n");
return NULL;
+ }
return (unsigned long *)m->private + *pos;
}


--
Michal Hocko
SUSE Labs

2020-04-06 07:45:17

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH 1/2] MM: replace PF_LESS_THROTTLE with PF_LOCAL_THROTTLE

On Sat 04-04-20 08:40:17, Neil Brown wrote:
> On Fri, Apr 03 2020, Michal Hocko wrote:
>
> > On Thu 02-04-20 10:53:20, Neil Brown wrote:
> >>
> >> PF_LESS_THROTTLE exists for loop-back nfsd, and a similar need in the
> >> loop block driver, where a daemon needs to write to one bdi in
> >> order to free up writes queued to another bdi.
> >>
> >> The daemon sets PF_LESS_THROTTLE and gets a larger allowance of dirty
> >> pages, so that it can still dirty pages after other processes have been
> >> throttled.
> >>
> >> This approach was designed when all threads were blocked equally,
> >> independently of which device they were writing to, or how fast it was.
> >> Since that time the writeback algorithm has changed substantially with
> >> different threads getting different allowances based on non-trivial
> >> heuristics. This means the simple "add 25%" heuristic is no longer
> >> reliable.
> >>
> >> This patch changes the heuristic to ignore the global limits and
> >> consider only the limit relevant to the bdi being written to. This
> >> approach is already available for BDI_CAP_STRICTLIMIT users (fuse) and
> >> should not introduce surprises. This has the desired result of
> >> protecting the task from the consequences of large amounts of dirty data
> >> queued for other devices.
> >
> > While I understand that you want to have per bdi throttling for those
> > "special" files I am still missing how this is going to provide the
> > additional room that the additional 25% gave them previously. I might
> > misremember or things have changed (what you mention as non-trivial
> > heuristics) but PF_LESS_THROTTLE really needed that room to guarantee
> > forward progress. Care to expand some more on how this is handled now?
> > Maybe we do not need it anymore but calling that out explicitly would be
> > really helpful.
>
> The 25% was a means to an end, not an end in itself.
>
> The problem is that the NFS server needs to be able to write to the
> backing filesystem when the dirty memory limits have been reached by
> being totally consumed by dirty pages on the NFS filesystem.
>
> The 25% was just a way of giving an allowance of dirty pages to nfsd
> that could not be consumed by processes writing to an NFS filesystem.
> i.e. it doesn't need 25% MORE, it needs 25% PRIVATELY. Actually it only
> really needs 1 page privately, but a few pages give better throughput
> and 25% seemed like a good idea at the time.

Yes this part is clear to me.

> per-bdi throttling focuses on the "PRIVATELY" (the important bit) and
> de-emphasises the 25% (the irrelevant detail).

It is still not clear to me how this patch is going to behave when the
global dirty throttling is essentially equal to the per-bdi - e.g. there
is only a single bdi and now the PF_LOCAL_THROTTLE process doesn't have
anything private.

--
Michal Hocko
SUSE Labs

2020-04-06 09:36:31

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH 1/2] MM: replace PF_LESS_THROTTLE with PF_LOCAL_THROTTLE

On Mon 06-04-20 09:44:53, Michal Hocko wrote:
> On Sat 04-04-20 08:40:17, Neil Brown wrote:
> > On Fri, Apr 03 2020, Michal Hocko wrote:
> >
> > > On Thu 02-04-20 10:53:20, Neil Brown wrote:
> > >>
> > >> PF_LESS_THROTTLE exists for loop-back nfsd, and a similar need in the
> > >> loop block driver, where a daemon needs to write to one bdi in
> > >> order to free up writes queued to another bdi.
> > >>
> > >> The daemon sets PF_LESS_THROTTLE and gets a larger allowance of dirty
> > >> pages, so that it can still dirty pages after other processes have been
> > >> throttled.
> > >>
> > >> This approach was designed when all threads were blocked equally,
> > >> independently of which device they were writing to, or how fast it was.
> > >> Since that time the writeback algorithm has changed substantially with
> > >> different threads getting different allowances based on non-trivial
> > >> heuristics. This means the simple "add 25%" heuristic is no longer
> > >> reliable.
> > >>
> > >> This patch changes the heuristic to ignore the global limits and
> > >> consider only the limit relevant to the bdi being written to. This
> > >> approach is already available for BDI_CAP_STRICTLIMIT users (fuse) and
> > >> should not introduce surprises. This has the desired result of
> > >> protecting the task from the consequences of large amounts of dirty data
> > >> queued for other devices.
> > >
> > > While I understand that you want to have per bdi throttling for those
> > > "special" files I am still missing how this is going to provide the
> > > additional room that the additional 25% gave them previously. I might
> > > misremember or things have changed (what you mention as non-trivial
> > > heuristics) but PF_LESS_THROTTLE really needed that room to guarantee
> > > forward progress. Care to expand some more on how this is handled now?
> > > Maybe we do not need it anymore but calling that out explicitly would be
> > > really helpful.
> >
> > The 25% was a means to an end, not an end in itself.
> >
> > The problem is that the NFS server needs to be able to write to the
> > backing filesystem when the dirty memory limits have been reached by
> > being totally consumed by dirty pages on the NFS filesystem.
> >
> > The 25% was just a way of giving an allowance of dirty pages to nfsd
> > that could not be consumed by processes writing to an NFS filesystem.
> > i.e. it doesn't need 25% MORE, it needs 25% PRIVATELY. Actually it only
> > really needs 1 page privately, but a few pages give better throughput
> > and 25% seemed like a good idea at the time.
>
> Yes this part is clear to me.
>
> > per-bdi throttling focuses on the "PRIVATELY" (the important bit) and
> > de-emphasises the 25% (the irrelevant detail).
>
> It is still not clear to me how this patch is going to behave when the
> global dirty throttling is essentially equal to the per-bdi - e.g. there
> is only a single bdi and now the PF_LOCAL_THROTTLE process doesn't have
> anything private.

Let me think out loud to see whether I understand this properly. There are
two BDIs involved in NFS loop mount - the NFS virtual BDI (let's call it
simply NFS-bdi) and the bdi of the real filesystem that is backing NFS
(let's call this real-bdi). The case we are concerned about is when NFS-bdi
is full of dirty pages so that global dirty limit of the machine is
exceeded. Then flusher thread will take dirty pages from NFS-bdi and send
them over localhost to nfsd. Nfsd, which has PF_LOCAL_THROTTLE set, will take
these pages and write them to real-bdi. Now because PF_LOCAL_THROTTLE is
set for nfsd, the fact that we are over global limit does not take effect
and nfsd is still able to write to real-bdi until dirty limit on real-bdi
is reached. So things should work as Neil writes AFAIU.
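
Schematically, the flow is:

 app dirties pages -> NFS-bdi fills up -> global dirty limit exceeded
 -> flusher thread writes back NFS-bdi pages -> loopback RPC to nfsd
 -> nfsd (PF_LOCAL_THROTTLE set) writes to real-bdi, throttled only
 by real-bdi's own limit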

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2020-04-06 10:57:54

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH 1/2] MM: replace PF_LESS_THROTTLE with PF_LOCAL_THROTTLE

On Mon 06-04-20 11:36:01, Jan Kara wrote:
> On Mon 06-04-20 09:44:53, Michal Hocko wrote:
> > On Sat 04-04-20 08:40:17, Neil Brown wrote:
> > > On Fri, Apr 03 2020, Michal Hocko wrote:
> > >
> > > > On Thu 02-04-20 10:53:20, Neil Brown wrote:
> > > >>
> > > >> PF_LESS_THROTTLE exists for loop-back nfsd, and a similar need in the
> > > >> loop block driver, where a daemon needs to write to one bdi in
> > > >> order to free up writes queued to another bdi.
> > > >>
> > > >> The daemon sets PF_LESS_THROTTLE and gets a larger allowance of dirty
> > > >> pages, so that it can still dirty pages after other processes have been
> > > >> throttled.
> > > >>
> > > >> This approach was designed when all threads were blocked equally,
> > > >> independently of which device they were writing to, or how fast it was.
> > > >> Since that time the writeback algorithm has changed substantially with
> > > >> different threads getting different allowances based on non-trivial
> > > >> heuristics. This means the simple "add 25%" heuristic is no longer
> > > >> reliable.
> > > >>
> > > >> This patch changes the heuristic to ignore the global limits and
> > > >> consider only the limit relevant to the bdi being written to. This
> > > >> approach is already available for BDI_CAP_STRICTLIMIT users (fuse) and
> > > >> should not introduce surprises. This has the desired result of
> > > >> protecting the task from the consequences of large amounts of dirty data
> > > >> queued for other devices.
> > > >
> > > > While I understand that you want to have per bdi throttling for those
> > > > "special" files I am still missing how this is going to provide the
> > > > additional room that the additional 25% gave them previously. I might
> > > > misremember or things have changed (what you mention as non-trivial
> > > > heuristics) but PF_LESS_THROTTLE really needed that room to guarantee
> > > > forward progress. Care to expand some more on how this is handled now?
> > > > Maybe we do not need it anymore but calling that out explicitly would be
> > > > really helpful.
> > >
> > > The 25% was a means to an end, not an end in itself.
> > >
> > > The problem is that the NFS server needs to be able to write to the
> > > backing filesystem when the dirty memory limits have been reached by
> > > being totally consumed by dirty pages on the NFS filesystem.
> > >
> > > The 25% was just a way of giving an allowance of dirty pages to nfsd
> > > that could not be consumed by processes writing to an NFS filesystem.
> > > i.e. it doesn't need 25% MORE, it needs 25% PRIVATELY. Actually it only
> > > really needs 1 page privately, but a few pages give better throughput
> > > and 25% seemed like a good idea at the time.
> >
> > Yes this part is clear to me.
> >
> > > per-bdi throttling focuses on the "PRIVATELY" (the important bit) and
> > > de-emphasises the 25% (the irrelevant detail).
> >
> > It is still not clear to me how this patch is going to behave when the
> > global dirty throttling is essentially equal to the per-bdi - e.g. there
> > is only a single bdi and now the PF_LOCAL_THROTTLE process doesn't have
> > anything private.
>
> Let me think out loud to see whether I understand this properly. There are
> two BDIs involved in NFS loop mount - the NFS virtual BDI (let's call it
> simply NFS-bdi) and the bdi of the real filesystem that is backing NFS
> (let's call this real-bdi). The case we are concerned about is when NFS-bdi
> is full of dirty pages so that global dirty limit of the machine is
> exceeded. Then flusher thread will take dirty pages from NFS-bdi and send
> them over localhost to nfsd. Nfsd, which has PF_LOCAL_THROTTLE set, will take
> these pages and write them to real-bdi. Now because PF_LOCAL_THROTTLE is
> set for nfsd, the fact that we are over global limit does not take effect
> and nfsd is still able to write to real-bdi until dirty limit on real-bdi
> is reached. So things should work as Neil writes AFAIU.

Thanks for the clarification. I was not aware of the two-bdi situation.
This makes more sense now. Maybe this is a trivial fact for everybody
who is more familiar with nfs internals, but it would be so much easier
to follow if it were explicit in the changelog.
--
Michal Hocko
SUSE Labs

2020-04-06 18:24:42

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH 1/2] MM: replace PF_LESS_THROTTLE with PF_LOCAL_THROTTLE

On Mon, Apr 06 2020, Jan Kara wrote:

> On Mon 06-04-20 09:44:53, Michal Hocko wrote:
>> On Sat 04-04-20 08:40:17, Neil Brown wrote:
>> > On Fri, Apr 03 2020, Michal Hocko wrote:
>> >
>> > > On Thu 02-04-20 10:53:20, Neil Brown wrote:
>> > >>
>> > >> PF_LESS_THROTTLE exists for loop-back nfsd, and a similar need in the
>> > >> loop block driver, where a daemon needs to write to one bdi in
>> > >> order to free up writes queued to another bdi.
>> > >>
>> > >> The daemon sets PF_LESS_THROTTLE and gets a larger allowance of dirty
>> > >> pages, so that it can still dirty pages after other processes have been
>> > >> throttled.
>> > >>
>> > >> This approach was designed when all threads were blocked equally,
>> > >> independently of which device they were writing to, or how fast it was.
>> > >> Since that time the writeback algorithm has changed substantially with
>> > >> different threads getting different allowances based on non-trivial
>> > >> heuristics. This means the simple "add 25%" heuristic is no longer
>> > >> reliable.
>> > >>
>> > >> This patch changes the heuristic to ignore the global limits and
>> > >> consider only the limit relevant to the bdi being written to. This
>> > >> approach is already available for BDI_CAP_STRICTLIMIT users (fuse) and
>> > >> should not introduce surprises. This has the desired result of
>> > >> protecting the task from the consequences of large amounts of dirty data
>> > >> queued for other devices.
>> > >
>> > > While I understand that you want to have per bdi throttling for those
>> > > "special" files I am still missing how this is going to provide the
>> > > additional room that the additional 25% gave them previously. I might
>> > > misremember or things have changed (what you mention as non-trivial
>> > > heuristics) but PF_LESS_THROTTLE really needed that room to guarantee
>> > > forward progress. Care to expand some more on how this is handled now?
>> > > Maybe we do not need it anymore but calling that out explicitly would be
>> > > really helpful.
>> >
>> > The 25% was a means to an end, not an end in itself.
>> >
>> > The problem is that the NFS server needs to be able to write to the
>> > backing filesystem when the dirty memory limits have been reached by
>> > being totally consumed by dirty pages on the NFS filesystem.
>> >
>> > The 25% was just a way of giving an allowance of dirty pages to nfsd
>> > that could not be consumed by processes writing to an NFS filesystem.
>> > i.e. it doesn't need 25% MORE, it needs 25% PRIVATELY. Actually it only
>> > really needs 1 page privately, but a few pages give better throughput
>> > and 25% seemed like a good idea at the time.
>>
>> Yes this part is clear to me.
>>
>> > per-bdi throttling focuses on the "PRIVATELY" (the important bit) and
>> > de-emphasises the 25% (the irrelevant detail).
>>
>> It is still not clear to me how this patch is going to behave when the
>> global dirty throttling is essentially equal to the per-bdi - e.g. there
>> is only a single bdi and now the PF_LOCAL_THROTTLE process doesn't have
>> anything private.
>
> Let me think out loud to see whether I understand this properly. There are
> two BDIs involved in NFS loop mount - the NFS virtual BDI (let's call it
> simply NFS-bdi) and the bdi of the real filesystem that is backing NFS
> (let's call this real-bdi). The case we are concerned about is when NFS-bdi
> is full of dirty pages so that global dirty limit of the machine is
> exceeded. Then flusher thread will take dirty pages from NFS-bdi and send
> them over localhost to nfsd. Nfsd, which has PF_LOCAL_THROTTLE set, will take
> these pages and write them to real-bdi. Now because PF_LOCAL_THROTTLE is
> set for nfsd, the fact that we are over global limit does not take effect
> and nfsd is still able to write to real-bdi until dirty limit on real-bdi
> is reached. So things should work as Neil writes AFAIU.

Exactly. The 'loop' block device follows a similar pattern - there is
the 'loop' bdi that might consume all the allowed dirty pages, and the
backing bdi that we need to write to so those dirty pages can be
cleaned.

The intention for PR_SET_IO_FLUSHER as described in 'man 2 prctl'
is much the same. The thread that sets this is expected to be working
on behalf of a "block layer or filesystem" such as "FUSE daemons, SCSI
device emulation daemons" - each of these would be serving a bdi
"above" by writing to a bdi "below".

I'll add some more text to the changelog to make this clearer.

Thanks,
NeilBrown


Attachments:
signature.asc (847.00 B)

2020-04-06 23:29:03

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH 2/2 - v2] MM: Discard NR_UNSTABLE_NFS, use NR_WRITEBACK instead.

On Fri, Apr 03 2020, Jan Kara wrote:
>
> So I don't think we can just remove lines from procfs files like this. That
> has a high potential of breaking some userspace app that is not careful
> enough when parsing the file. So I think we need to keep the
> format string and just replace K(node_page_state(pgdat, NR_UNSTABLE_NFS))
> with 0.

OK. I assume changing the static trace points isn't a problem though?

Thanks,
NeilBrown


Attachments:
signature.asc (847.00 B)

2020-04-06 23:43:07

by NeilBrown

[permalink] [raw]
Subject: Writeback fixes for NFS - V2


This is a second version of my patches to improve writeback over NFS.
This was prompted by a customer report of very slow throughput to a
server running the Ganesha NFS server, which was taking about 200ms to
handle COMMIT requests. While slow COMMIT requests do exacerbate the
problem, there are still problems when COMMIT is comparatively fast.
The latest test results from the customer are that these changes provide
substantial improvements.

Changes over the first version include improvements to the explanation in
the commit message, and preservation of the "NFS Unstable" counts in
various statistics files, despite the fact that the internal accounting
is completely gone (merged with writeback accounting).

Thanks,
NeilBrown


Attachments:
signature.asc (847.00 B)

2020-04-06 23:43:50

by NeilBrown

[permalink] [raw]
Subject: [PATCH 1/2] MM: replace PF_LESS_THROTTLE with PF_LOCAL_THROTTLE


PF_LESS_THROTTLE exists for loop-back nfsd, and a similar need in the
loop block driver, where a daemon needs to write to one bdi (the final
bdi) in order to free up writes queued to another bdi (the client bdi).

The daemon sets PF_LESS_THROTTLE and gets a larger allowance of dirty
pages, so that it can still dirty pages after other processes have been
throttled.

This approach was designed when all threads were blocked equally,
independently of which device they were writing to, or how fast it was.
Since that time the writeback algorithm has changed substantially with
different threads getting different allowances based on non-trivial
heuristics. This means the simple "add 25%" heuristic is no longer
reliable.

The important issue is not that the daemon needs a *larger* dirty page
allowance, but that it needs a *private* dirty page allowance, so that
dirty pages for the "client" bdi that it is helping to clear (the bdi for
an NFS filesystem or loop block device etc) do not affect the throttling
of the daemon writing to the "final" bdi.

This patch changes the heuristic to ignore the global limits and
consider only the limit relevant to the bdi being written to. This
approach is already available for BDI_CAP_STRICTLIMIT users (fuse) and
should not introduce surprises. This has the desired result of
protecting the task from the consequences of large amounts of dirty data
queued for other devices.

This approach of "only consider the target bdi" is consistent with the
other use of PF_LESS_THROTTLE in current_may_throttle(), where it causes
attention to be focussed only on the target bdi.

So this patch
- renames PF_LESS_THROTTLE to PF_LOCAL_THROTTLE,
- removes the 25% bonus that that flag gives, and
- imposes 'strictlimit' handling for any process with PF_LOCAL_THROTTLE
set.

Note that previously realtime threads were treated the same as
PF_LESS_THROTTLE threads. This patch does *not* change the behaviour for
real-time threads, so it is now different from the behaviour of nfsd and
loop tasks. I don't know what is wanted for realtime.

Note that the worst-case situation with this patch is that the threshold
might be calculated as zero. In that case the daemon may block when
there are any dirty pages for the final bdi. These will eventually
clear and the daemon will be able to proceed. The writing of those
dirty pages will increase the apparent throughput of the final bdi and
thus increase its threshold for future calculations.

Reviewed-by: Jan Kara <[email protected]>
Signed-off-by: NeilBrown <[email protected]>
---
drivers/block/loop.c | 2 +-
fs/nfsd/vfs.c | 9 +++++----
include/linux/sched.h | 3 ++-
kernel/sys.c | 2 +-
mm/page-writeback.c | 10 ++++++----
mm/vmscan.c | 4 ++--
6 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index a42c49e04954..0e13b9fc8dfa 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -899,7 +899,7 @@ static void loop_unprepare_queue(struct loop_device *lo)

static int loop_kthread_worker_fn(void *worker_ptr)
{
- current->flags |= PF_LESS_THROTTLE | PF_MEMALLOC_NOIO;
+ current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
return kthread_worker_fn(worker_ptr);
}

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 0aa02eb18bd3..c3fbab1753ec 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -979,12 +979,13 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,

if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
/*
- * We want less throttling in balance_dirty_pages()
- * and shrink_inactive_list() so that nfs to
+ * We want throttling in balance_dirty_pages()
+ * and shrink_inactive_list() to only consider
+ * the backingdev we are writing to, so that nfs to
* localhost doesn't cause nfsd to lock up due to all
* the client's dirty pages or its congested queue.
*/
- current->flags |= PF_LESS_THROTTLE;
+ current->flags |= PF_LOCAL_THROTTLE;

exp = fhp->fh_export;
use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
@@ -1037,7 +1038,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
nfserr = nfserrno(host_err);
}
if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
- current_restore_flags(pflags, PF_LESS_THROTTLE);
+ current_restore_flags(pflags, PF_LOCAL_THROTTLE);
return nfserr;
}

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4418f5cb8324..5955a089df32 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1481,7 +1481,8 @@ extern struct pid *cad_pid;
#define PF_KSWAPD 0x00020000 /* I am kswapd */
#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */
#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */
-#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
+#define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to,
+ * I am cleaning dirty pages from some other bdi. */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
diff --git a/kernel/sys.c b/kernel/sys.c
index d325f3ab624a..180a2fa33f7f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2262,7 +2262,7 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
return -EINVAL;
}

-#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE)
+#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)

SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7326b54ab728..4c9875971de5 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -387,8 +387,7 @@ static unsigned long global_dirtyable_memory(void)
* Calculate @dtc->thresh and ->bg_thresh considering
* vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller
* must ensure that @dtc->avail is set before calling this function. The
- * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
- * real-time tasks.
+ * dirty limits will be lifted by 1/4 for real-time tasks.
*/
static void domain_dirty_limits(struct dirty_throttle_control *dtc)
{
@@ -436,7 +435,7 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
if (bg_thresh >= thresh)
bg_thresh = thresh / 2;
tsk = current;
- if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
+ if (rt_task(tsk)) {
bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
}
@@ -486,7 +485,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat)
else
dirty = vm_dirty_ratio * node_memory / 100;

- if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
+ if (rt_task(tsk))
dirty += dirty / 4;

return dirty;
@@ -1580,6 +1579,9 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
unsigned long start_time = jiffies;

+ if (current->flags & PF_LOCAL_THROTTLE)
+ /* This task must only be throttled by its own writeback */
+ strictlimit = true;
for (;;) {
unsigned long now = jiffies;
unsigned long dirty, thresh, bg_thresh;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2e8e690d2813..b776da4bb8c8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1879,13 +1879,13 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,

/*
* If a kernel thread (such as nfsd for loop-back mounts) services
- * a backing device by writing to the page cache it sets PF_LESS_THROTTLE.
+ * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE.
* In that case we should only throttle if the backing device it is
* writing to is congested. In other cases it is safe to throttle.
*/
static int current_may_throttle(void)
{
- return !(current->flags & PF_LESS_THROTTLE) ||
+ return !(current->flags & PF_LOCAL_THROTTLE) ||
current->backing_dev_info == NULL ||
bdi_write_congested(current->backing_dev_info);
}
--
2.26.0


Attachments:
signature.asc (847.00 B)

2020-04-07 07:34:30

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH 2/2 - v2] MM: Discard NR_UNSTABLE_NFS, use NR_WRITEBACK instead.

On Tue 07-04-20 09:28:19, Neil Brown wrote:
> On Fri, Apr 03 2020, Jan Kara wrote:
> >
> > So I don't think we can just remove lines from procfs files like this. That
> > has a high potential of breaking some userspace app that is not careful
> > enough when parsing the file. So I think we need to keep the
> > format string and just replace K(node_page_state(pgdat, NR_UNSTABLE_NFS))
> > with 0.
>
> OK. I assume changing the static trace points isn't a problem though?

It shouldn't be until we learn that somebody depends on it...
--
Michal Hocko
SUSE Labs

2020-04-07 16:11:03

by Chuck Lever

[permalink] [raw]
Subject: Re: [PATCH 1/2] MM: replace PF_LESS_THROTTLE with PF_LOCAL_THROTTLE



> On Apr 6, 2020, at 7:43 PM, NeilBrown <[email protected]> wrote:
>
>
> PF_LESS_THROTTLE exists for loop-back nfsd, and a similar need in the
> loop block driver, where a daemon needs to write to one bdi (the final
> bdi) in order to free up writes queued to another bdi (the client bdi).
>
> The daemon sets PF_LESS_THROTTLE and gets a larger allowance of dirty
> pages, so that it can still dirty pages after other processes have been
> throttled.
>
> This approach was designed when all threads were blocked equally,
> independently of which device they were writing to, or how fast it was.
> Since that time the writeback algorithm has changed substantially with
> different threads getting different allowances based on non-trivial
> heuristics. This means the simple "add 25%" heuristic is no longer
> reliable.
>
> The important issue is not that the daemon needs a *larger* dirty page
> allowance, but that it needs a *private* dirty page allowance, so that
> dirty pages for the "client" bdi that it is helping to clear (the bdi for
> an NFS filesystem or loop block device etc) do not affect the throttling
> of the daemon writing to the "final" bdi.
>
> This patch changes the heuristic to ignore the global limits and
> consider only the limit relevant to the bdi being written to. This
> approach is already available for BDI_CAP_STRICTLIMIT users (fuse) and
> should not introduce surprises. This has the desired result of
> protecting the task from the consequences of large amounts of dirty data
> queued for other devices.
>
> This approach of "only consider the target bdi" is consistent with the
> other use of PF_LESS_THROTTLE in current_may_throttle(), where it causes
> attention to be focussed only on the target bdi.
>
> So this patch
> - renames PF_LESS_THROTTLE to PF_LOCAL_THROTTLE,
> - removes the 25% bonus that that flag gives, and
> - imposes 'strictlimit' handling for any process with PF_LOCAL_THROTTLE
> set.
>
> Note that previously realtime threads were treated the same as
> PF_LESS_THROTTLE threads. This patch does *not* change the behaviour for
> real-time threads, so it is now different from the behaviour of nfsd and
> loop tasks. I don't know what is wanted for realtime.
>
> Note that the worst-case situation with this patch is that the threshold
> might be calculated as zero. In that case the daemon may block when
> there are any dirty pages for the final bdi. These will eventually
> clear and the daemon will be able to proceed. The writing of those
> dirty pages will increase the apparent throughput of the final bdi and
> thus increase its threshold for future calculations.
>
> Reviewed-by: Jan Kara <[email protected]>
> Signed-off-by: NeilBrown <[email protected]>
> ---
> drivers/block/loop.c | 2 +-
> fs/nfsd/vfs.c | 9 +++++----
> include/linux/sched.h | 3 ++-
> kernel/sys.c | 2 +-
> mm/page-writeback.c | 10 ++++++----
> mm/vmscan.c | 4 ++--
> 6 files changed, 17 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index a42c49e04954..0e13b9fc8dfa 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -899,7 +899,7 @@ static void loop_unprepare_queue(struct loop_device *lo)
>
> static int loop_kthread_worker_fn(void *worker_ptr)
> {
> - current->flags |= PF_LESS_THROTTLE | PF_MEMALLOC_NOIO;
> + current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
> return kthread_worker_fn(worker_ptr);
> }
>
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 0aa02eb18bd3..c3fbab1753ec 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -979,12 +979,13 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,

Assuming these patches are not going through the NFSD tree, this hunk is

Acked-by: Chuck Lever <[email protected]>

If this isn't necessary or appropriate, then ignore me :-)


> if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
> /*
> - * We want less throttling in balance_dirty_pages()
> - * and shrink_inactive_list() so that nfs to
> + * We want throttling in balance_dirty_pages()
> + * and shrink_inactive_list() to only consider
> + * the backingdev we are writing to, so that nfs to
> * localhost doesn't cause nfsd to lock up due to all
> * the client's dirty pages or its congested queue.
> */
> - current->flags |= PF_LESS_THROTTLE;
> + current->flags |= PF_LOCAL_THROTTLE;
>
> exp = fhp->fh_export;
> use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
> @@ -1037,7 +1038,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
> nfserr = nfserrno(host_err);
> }
> if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
> - current_restore_flags(pflags, PF_LESS_THROTTLE);
> + current_restore_flags(pflags, PF_LOCAL_THROTTLE);
> return nfserr;
> }
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 4418f5cb8324..5955a089df32 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1481,7 +1481,8 @@ extern struct pid *cad_pid;
> #define PF_KSWAPD 0x00020000 /* I am kswapd */
> #define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */
> #define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */
> -#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
> +#define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to,
> + * I am cleaning dirty pages from some other bdi. */
> #define PF_KTHREAD 0x00200000 /* I am a kernel thread */
> #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
> #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
> diff --git a/kernel/sys.c b/kernel/sys.c
> index d325f3ab624a..180a2fa33f7f 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -2262,7 +2262,7 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
> return -EINVAL;
> }
>
> -#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE)
> +#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)
>
> SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
> unsigned long, arg4, unsigned long, arg5)
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 7326b54ab728..4c9875971de5 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -387,8 +387,7 @@ static unsigned long global_dirtyable_memory(void)
> * Calculate @dtc->thresh and ->bg_thresh considering
> * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller
> * must ensure that @dtc->avail is set before calling this function. The
> - * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
> - * real-time tasks.
> + * dirty limits will be lifted by 1/4 for real-time tasks.
> */
> static void domain_dirty_limits(struct dirty_throttle_control *dtc)
> {
> @@ -436,7 +435,7 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
> if (bg_thresh >= thresh)
> bg_thresh = thresh / 2;
> tsk = current;
> - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
> + if (rt_task(tsk)) {
> bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
> thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
> }
> @@ -486,7 +485,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat)
> else
> dirty = vm_dirty_ratio * node_memory / 100;
>
> - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
> + if (rt_task(tsk))
> dirty += dirty / 4;
>
> return dirty;
> @@ -1580,6 +1579,9 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
> bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
> unsigned long start_time = jiffies;
>
> + if (current->flags & PF_LOCAL_THROTTLE)
> + /* This task must only be throttled by its own writeback */
> + strictlimit = true;
> for (;;) {
> unsigned long now = jiffies;
> unsigned long dirty, thresh, bg_thresh;
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 2e8e690d2813..b776da4bb8c8 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1879,13 +1879,13 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
>
> /*
> * If a kernel thread (such as nfsd for loop-back mounts) services
> - * a backing device by writing to the page cache it sets PF_LESS_THROTTLE.
> + * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE.
> * In that case we should only throttle if the backing device it is
> * writing to is congested. In other cases it is safe to throttle.
> */
> static int current_may_throttle(void)
> {
> - return !(current->flags & PF_LESS_THROTTLE) ||
> + return !(current->flags & PF_LOCAL_THROTTLE) ||
> current->backing_dev_info == NULL ||
> bdi_write_congested(current->backing_dev_info);
> }
> --
> 2.26.0
>

--
Chuck Lever
[email protected]