2020-06-25 11:32:58

by Matthew Wilcox

[permalink] [raw]
Subject: [PATCH 0/6] Overhaul memalloc_no*

I want a memalloc_nowait like we have memalloc_noio and memalloc_nofs
for an upcoming patch series, and Jens also wants it for non-blocking
io_uring. It turns out we already have dm-bufio which could benefit
from memalloc_nowait, so it may as well go into the tree now.

The biggest problem is that we're basically out of PF_ flags, so we need
to find somewhere else to store the PF_MEMALLOC_NOWAIT flag. It turns
out the PF_ flags are really supposed to be used for flags which are
accessed from other tasks, and the MEMALLOC flags are only going to
be used by this task. So shuffling everything around frees up some PF
flags and generally makes the world a better place.

Patch series also available from
http://git.infradead.org/users/willy/linux.git/shortlog/refs/heads/memalloc

Matthew Wilcox (Oracle) (6):
mm: Replace PF_MEMALLOC_NOIO with memalloc_noio
mm: Add become_kswapd and restore_kswapd
xfs: Convert to memalloc_nofs_save
mm: Replace PF_MEMALLOC_NOFS with memalloc_nofs
mm: Replace PF_MEMALLOC_NOCMA with memalloc_nocma
mm: Add memalloc_nowait

drivers/block/loop.c | 3 +-
drivers/md/dm-bufio.c | 30 ++++--------
drivers/md/dm-zoned-metadata.c | 5 +-
fs/iomap/buffered-io.c | 2 +-
fs/xfs/kmem.c | 2 +-
fs/xfs/libxfs/xfs_btree.c | 14 +++---
fs/xfs/xfs_aops.c | 4 +-
fs/xfs/xfs_buf.c | 2 +-
fs/xfs/xfs_linux.h | 6 ---
fs/xfs/xfs_trans.c | 14 +++---
fs/xfs/xfs_trans.h | 2 +-
include/linux/sched.h | 7 +--
include/linux/sched/mm.h | 84 ++++++++++++++++++++++++++--------
kernel/sys.c | 8 ++--
mm/vmscan.c | 16 +------
15 files changed, 105 insertions(+), 94 deletions(-)

--
2.27.0


2020-06-25 11:33:45

by Matthew Wilcox

[permalink] [raw]
Subject: [PATCH 5/6] mm: Replace PF_MEMALLOC_NOCMA with memalloc_nocma

We're short on PF_* flags, so make memalloc_nocma its own bit where we
have plenty of space.

Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
---
include/linux/sched.h | 2 +-
include/linux/sched/mm.h | 15 +++++++--------
2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index eaf36ae1fde2..90336850e940 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -803,6 +803,7 @@ struct task_struct {
#endif
unsigned memalloc_noio:1;
unsigned memalloc_nofs:1;
+ unsigned memalloc_nocma:1;

unsigned long atomic_flags; /* Flags requiring atomic access. */

@@ -1514,7 +1515,6 @@ extern struct pid *cad_pid;
#define PF_UMH 0x02000000 /* I'm an Usermodehelper process */
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
-#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */
#define PF_IO_WORKER 0x20000000 /* Task is an IO worker */
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 08bc9d0606a8..6f7b59a848a6 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -175,12 +175,11 @@ static inline bool in_vfork(struct task_struct *tsk)

/*
* Applies per-task gfp context to the given allocation flags.
- * PF_MEMALLOC_NOCMA implies no allocation from CMA region.
*/
static inline gfp_t current_gfp_context(gfp_t flags)
{
- if (unlikely((current->flags & PF_MEMALLOC_NOCMA) ||
- current->memalloc_noio || current->memalloc_nofs)) {
+ if (unlikely(current->memalloc_noio || current->memalloc_nofs ||
+ current->memalloc_nocma)) {
/*
* NOIO implies both NOIO and NOFS and it is a weaker context
* so always make sure it makes precedence
@@ -190,7 +189,8 @@ static inline gfp_t current_gfp_context(gfp_t flags)
else if (current->memalloc_nofs)
flags &= ~__GFP_FS;
#ifdef CONFIG_CMA
- if (current->flags & PF_MEMALLOC_NOCMA)
+ /* memalloc_nocma implies no allocation from movable region */
+ if (current->memalloc_nocma)
flags &= ~__GFP_MOVABLE;
#endif
}
@@ -286,15 +286,14 @@ static inline void memalloc_noreclaim_restore(unsigned int flags)
#ifdef CONFIG_CMA
static inline unsigned int memalloc_nocma_save(void)
{
- unsigned int flags = current->flags & PF_MEMALLOC_NOCMA;
-
- current->flags |= PF_MEMALLOC_NOCMA;
+ unsigned int flags = current->memalloc_nocma;
+ current->memalloc_nocma = 1;
return flags;
}

static inline void memalloc_nocma_restore(unsigned int flags)
{
- current->flags = (current->flags & ~PF_MEMALLOC_NOCMA) | flags;
+ current->memalloc_nocma = flags ? 1 : 0;
}
#else
static inline unsigned int memalloc_nocma_save(void)
--
2.27.0

2020-06-25 11:34:18

by Matthew Wilcox

[permalink] [raw]
Subject: [PATCH 2/6] mm: Add become_kswapd and restore_kswapd

Since XFS needs to pretend to be kswapd in some of its worker threads,
create methods to save & restore kswapd state. Don't bother restoring
kswapd state in kswapd -- the only time we reach this code is when we're
exiting and the task_struct is about to be destroyed anyway.

Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
---
fs/xfs/libxfs/xfs_btree.c | 14 ++++++++------
include/linux/sched/mm.h | 26 ++++++++++++++++++++++++++
mm/vmscan.c | 16 +---------------
3 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 2d25bab68764..a04a44238aab 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2813,8 +2813,9 @@ xfs_btree_split_worker(
{
struct xfs_btree_split_args *args = container_of(work,
struct xfs_btree_split_args, work);
+ bool is_kswapd = args->kswapd;
unsigned long pflags;
- unsigned long new_pflags = PF_MEMALLOC_NOFS;
+ int memalloc_nofs;

/*
* we are in a transaction context here, but may also be doing work
@@ -2822,16 +2823,17 @@ xfs_btree_split_worker(
* temporarily to ensure that we don't block waiting for memory reclaim
* in any way.
*/
- if (args->kswapd)
- new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
-
- current_set_flags_nested(&pflags, new_pflags);
+ if (is_kswapd)
+ pflags = become_kswapd();
+ memalloc_nofs = memalloc_nofs_save();

args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
args->key, args->curp, args->stat);
complete(args->done);

- current_restore_flags_nested(&pflags, new_pflags);
+ memalloc_nofs_restore(memalloc_nofs);
+ if (is_kswapd)
+ restore_kswapd(pflags);
}

/*
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 1a7e1ab1be85..b0089eadc367 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -308,6 +308,32 @@ static inline void memalloc_nocma_restore(unsigned int flags)
}
#endif

+/*
+ * Tell the memory management that we're a "memory allocator",
+ * and that if we need more memory we should get access to it
+ * regardless (see "__alloc_pages()"). "kswapd" should
+ * never get caught in the normal page freeing logic.
+ *
+ * (Kswapd normally doesn't need memory anyway, but sometimes
+ * you need a small amount of memory in order to be able to
+ * page out something else, and this flag essentially protects
+ * us from recursively trying to free more memory as we're
+ * trying to free the first piece of memory in the first place).
+ */
+#define KSWAPD_PF_FLAGS (PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD)
+
+static inline unsigned long become_kswapd(void)
+{
+ unsigned long flags = current->flags & KSWAPD_PF_FLAGS;
+ current->flags |= KSWAPD_PF_FLAGS;
+ return flags;
+}
+
+static inline void restore_kswapd(unsigned long flags)
+{
+ current->flags &= ~(flags ^ KSWAPD_PF_FLAGS);
+}
+
static inline void set_current_io_flusher(void)
{
current->flags |= PF_LOCAL_THROTTLE;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b6d84326bdf2..27ae76699899 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3870,19 +3870,7 @@ static int kswapd(void *p)
if (!cpumask_empty(cpumask))
set_cpus_allowed_ptr(tsk, cpumask);

- /*
- * Tell the memory management that we're a "memory allocator",
- * and that if we need more memory we should get access to it
- * regardless (see "__alloc_pages()"). "kswapd" should
- * never get caught in the normal page freeing logic.
- *
- * (Kswapd normally doesn't need memory anyway, but sometimes
- * you need a small amount of memory in order to be able to
- * page out something else, and this flag essentially protects
- * us from recursively trying to free more memory as we're
- * trying to free the first piece of memory in the first place).
- */
- tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+ become_kswapd();
set_freezable();

WRITE_ONCE(pgdat->kswapd_order, 0);
@@ -3932,8 +3920,6 @@ static int kswapd(void *p)
goto kswapd_try_sleep;
}

- tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
-
return 0;
}

--
2.27.0

2020-06-25 11:34:31

by Matthew Wilcox

[permalink] [raw]
Subject: [PATCH 4/6] mm: Replace PF_MEMALLOC_NOFS with memalloc_nofs

We're short on PF_* flags, so make memalloc_nofs its own bit where we
have plenty of space.

Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
---
fs/iomap/buffered-io.c | 2 +-
include/linux/sched.h | 2 +-
include/linux/sched/mm.h | 13 ++++++-------
3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index bcfc288dba3f..87d66c13bf5c 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1502,7 +1502,7 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
* Given that we do not allow direct reclaim to call us, we should
* never be called in a recursive filesystem reclaim context.
*/
- if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
+ if (WARN_ON_ONCE(current->memalloc_nofs))
goto redirty;

/*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index cf18a3d2bc4c..eaf36ae1fde2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -802,6 +802,7 @@ struct task_struct {
unsigned in_memstall:1;
#endif
unsigned memalloc_noio:1;
+ unsigned memalloc_nofs:1;

unsigned long atomic_flags; /* Flags requiring atomic access. */

@@ -1505,7 +1506,6 @@ extern struct pid *cad_pid;
#define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */
#define PF_FROZEN 0x00010000 /* Frozen for system suspend */
#define PF_KSWAPD 0x00020000 /* I am kswapd */
-#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */
#define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to,
* I am cleaning dirty pages from some other bdi. */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index b0089eadc367..08bc9d0606a8 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -175,20 +175,19 @@ static inline bool in_vfork(struct task_struct *tsk)

/*
* Applies per-task gfp context to the given allocation flags.
- * PF_MEMALLOC_NOFS implies GFP_NOFS
* PF_MEMALLOC_NOCMA implies no allocation from CMA region.
*/
static inline gfp_t current_gfp_context(gfp_t flags)
{
- if (unlikely(current->flags & (PF_MEMALLOC_NOFS | PF_MEMALLOC_NOCMA) ||
- current->memalloc_noio)) {
+ if (unlikely((current->flags & PF_MEMALLOC_NOCMA) ||
+ current->memalloc_noio || current->memalloc_nofs)) {
/*
* NOIO implies both NOIO and NOFS and it is a weaker context
* so always make sure it makes precedence
*/
if (current->memalloc_noio)
flags &= ~(__GFP_IO | __GFP_FS);
- else if (current->flags & PF_MEMALLOC_NOFS)
+ else if (current->memalloc_nofs)
flags &= ~__GFP_FS;
#ifdef CONFIG_CMA
if (current->flags & PF_MEMALLOC_NOCMA)
@@ -254,8 +253,8 @@ static inline void memalloc_noio_restore(unsigned int flags)
*/
static inline unsigned int memalloc_nofs_save(void)
{
- unsigned int flags = current->flags & PF_MEMALLOC_NOFS;
- current->flags |= PF_MEMALLOC_NOFS;
+ unsigned int flags = current->memalloc_nofs;
+ current->memalloc_nofs = 1;
return flags;
}

@@ -269,7 +268,7 @@ static inline unsigned int memalloc_nofs_save(void)
*/
static inline void memalloc_nofs_restore(unsigned int flags)
{
- current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags;
+ current->memalloc_nofs = flags ? 1 : 0;
}

static inline unsigned int memalloc_noreclaim_save(void)
--
2.27.0

2020-06-25 11:35:04

by Matthew Wilcox

[permalink] [raw]
Subject: [PATCH 3/6] xfs: Convert to memalloc_nofs_save

Instead of using custom macros to set/restore PF_MEMALLOC_NOFS, use
memalloc_nofs_save() like the rest of the kernel.

Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>
---
fs/xfs/kmem.c | 2 +-
fs/xfs/xfs_aops.c | 4 ++--
fs/xfs/xfs_buf.c | 2 +-
fs/xfs/xfs_linux.h | 6 ------
fs/xfs/xfs_trans.c | 14 +++++++-------
fs/xfs/xfs_trans.h | 2 +-
6 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index f1366475c389..c2d237159bfc 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -35,7 +35,7 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
* __vmalloc() will allocate data pages and auxiliary structures (e.g.
* pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence
* we need to tell memory reclaim that we are in such a context via
- * PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here
+ * memalloc_nofs to prevent memory reclaim re-entering the filesystem here
* and potentially deadlocking.
*/
static void *
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index b35611882ff9..e3a4806e519d 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -62,7 +62,7 @@ xfs_setfilesize_trans_alloc(
* We hand off the transaction to the completion thread now, so
* clear the flag here.
*/
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+ memalloc_nofs_restore(tp->t_memalloc);
return 0;
}

@@ -125,7 +125,7 @@ xfs_setfilesize_ioend(
* thus we need to mark ourselves as being in a transaction manually.
* Similarly for freeze protection.
*/
- current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+ tp->t_memalloc = memalloc_nofs_save();
__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);

/* we abort the update if there was an IO error */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 20b748f7e186..b2c3d01c690b 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -470,7 +470,7 @@ _xfs_buf_map_pages(
* vm_map_ram() will allocate auxiliary structures (e.g.
* pagetables) with GFP_KERNEL, yet we are likely to be under
* GFP_NOFS context here. Hence we need to tell memory reclaim
- * that we are in such a context via PF_MEMALLOC_NOFS to prevent
+ * that we are in such a context via memalloc_nofs to prevent
* memory reclaim re-entering the filesystem here and
* potentially deadlocking.
*/
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 9f70d2f68e05..e1daf242a53b 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -104,12 +104,6 @@ typedef __u32 xfs_nlink_t;
#define current_cpu() (raw_smp_processor_id())
#define current_pid() (current->pid)
#define current_test_flags(f) (current->flags & (f))
-#define current_set_flags_nested(sp, f) \
- (*(sp) = current->flags, current->flags |= (f))
-#define current_clear_flags_nested(sp, f) \
- (*(sp) = current->flags, current->flags &= ~(f))
-#define current_restore_flags_nested(sp, f) \
- (current->flags = ((current->flags & ~(f)) | (*(sp) & (f))))

#define NBBY 8 /* number of bits per byte */

diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 3c94e5ff4316..4ef1a0ff0a11 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -118,7 +118,7 @@ xfs_trans_dup(

ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
tp->t_rtx_res = tp->t_rtx_res_used;
- ntp->t_pflags = tp->t_pflags;
+ ntp->t_memalloc = tp->t_memalloc;

/* move deferred ops over to the new tp */
xfs_defer_move(ntp, tp);
@@ -153,7 +153,7 @@ xfs_trans_reserve(
bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;

/* Mark this thread as being in a transaction */
- current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+ tp->t_memalloc = memalloc_nofs_save();

/*
* Attempt to reserve the needed disk blocks by decrementing
@@ -163,7 +163,7 @@ xfs_trans_reserve(
if (blocks > 0) {
error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd);
if (error != 0) {
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+ memalloc_nofs_restore(tp->t_memalloc);
return -ENOSPC;
}
tp->t_blk_res += blocks;
@@ -240,7 +240,7 @@ xfs_trans_reserve(
tp->t_blk_res = 0;
}

- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+ memalloc_nofs_restore(tp->t_memalloc);

return error;
}
@@ -861,7 +861,7 @@ __xfs_trans_commit(

xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);

- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+ memalloc_nofs_restore(tp->t_memalloc);
xfs_trans_free(tp);

/*
@@ -893,7 +893,7 @@ __xfs_trans_commit(
xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
}
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+ memalloc_nofs_restore(tp->t_memalloc);
xfs_trans_free_items(tp, !!error);
xfs_trans_free(tp);

@@ -954,7 +954,7 @@ xfs_trans_cancel(
}

/* mark this thread as no longer being in a transaction */
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+ memalloc_nofs_restore(tp->t_memalloc);

xfs_trans_free_items(tp, dirty);
xfs_trans_free(tp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 8308bf6d7e40..7aa2d5ff9245 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -118,6 +118,7 @@ typedef struct xfs_trans {
unsigned int t_rtx_res; /* # of rt extents resvd */
unsigned int t_rtx_res_used; /* # of resvd rt extents used */
unsigned int t_flags; /* misc flags */
+ unsigned int t_memalloc; /* saved memalloc state */
xfs_fsblock_t t_firstblock; /* first block allocated */
struct xlog_ticket *t_ticket; /* log mgr ticket */
struct xfs_mount *t_mountp; /* ptr to fs mount struct */
@@ -144,7 +145,6 @@ typedef struct xfs_trans {
struct list_head t_items; /* log item descriptors */
struct list_head t_busy; /* list of busy extents */
struct list_head t_dfops; /* deferred operations */
- unsigned long t_pflags; /* saved process flags state */
} xfs_trans_t;

/*
--
2.27.0

2020-06-25 12:32:55

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH 2/6] mm: Add become_kswapd and restore_kswapd

On Thu 25-06-20 12:31:18, Matthew Wilcox wrote:
> Since XFS needs to pretend to be kswapd in some of its worker threads,
> create methods to save & restore kswapd state. Don't bother restoring
> kswapd state in kswapd -- the only time we reach this code is when we're
> exiting and the task_struct is about to be destroyed anyway.
>
> Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>

Certainly better than an opencoded PF_$FOO manipulation
Acked-by: Michal Hocko <[email protected]>

I would just ask for a clarification, because following this relies on
having good MM knowledge

> +/*
> + * Tell the memory management that we're a "memory allocator",

I would go with:
Tell the memory management that the caller is working on behalf of the
background memory reclaim (aka kswapd) and help it to make forward
progress. That means that it will get access to memory reserves
should there be a need to allocate memory in order to make forward
progress. Note that the caller has to be extremely careful when doing
that.

Or something like that.

> + * and that if we need more memory we should get access to it
> + * regardless (see "__alloc_pages()"). "kswapd" should
> + * never get caught in the normal page freeing logic.
> + *
> + * (Kswapd normally doesn't need memory anyway, but sometimes
> + * you need a small amount of memory in order to be able to
> + * page out something else, and this flag essentially protects
> + * us from recursively trying to free more memory as we're
> + * trying to free the first piece of memory in the first place).
> + */
> +#define KSWAPD_PF_FLAGS (PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD)
> +
> +static inline unsigned long become_kswapd(void)
> +{
> + unsigned long flags = current->flags & KSWAPD_PF_FLAGS;
> + current->flags |= KSWAPD_PF_FLAGS;
> + return flags;
> +}
> +
> +static inline void restore_kswapd(unsigned long flags)
> +{
> + current->flags &= ~(flags ^ KSWAPD_PF_FLAGS);
> +}
> +
> static inline void set_current_io_flusher(void)
> {
> current->flags |= PF_LOCAL_THROTTLE;
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index b6d84326bdf2..27ae76699899 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3870,19 +3870,7 @@ static int kswapd(void *p)
> if (!cpumask_empty(cpumask))
> set_cpus_allowed_ptr(tsk, cpumask);
>
> - /*
> - * Tell the memory management that we're a "memory allocator",
> - * and that if we need more memory we should get access to it
> - * regardless (see "__alloc_pages()"). "kswapd" should
> - * never get caught in the normal page freeing logic.
> - *
> - * (Kswapd normally doesn't need memory anyway, but sometimes
> - * you need a small amount of memory in order to be able to
> - * page out something else, and this flag essentially protects
> - * us from recursively trying to free more memory as we're
> - * trying to free the first piece of memory in the first place).
> - */
> - tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
> + become_kswapd();
> set_freezable();
>
> WRITE_ONCE(pgdat->kswapd_order, 0);
> @@ -3932,8 +3920,6 @@ static int kswapd(void *p)
> goto kswapd_try_sleep;
> }
>
> - tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
> -
> return 0;
> }
>
> --
> 2.27.0
>

--
Michal Hocko
SUSE Labs

2020-06-25 13:36:37

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH 4/6] mm: Replace PF_MEMALLOC_NOFS with memalloc_nofs

On Thu 25-06-20 12:31:20, Matthew Wilcox wrote:
> We're short on PF_* flags, so make memalloc_nofs its own bit where we
> have plenty of space.
>
> Signed-off-by: Matthew Wilcox (Oracle) <[email protected]>

forgot to add
Acked-by: Michal Hocko <[email protected]>

> ---
> fs/iomap/buffered-io.c | 2 +-
> include/linux/sched.h | 2 +-
> include/linux/sched/mm.h | 13 ++++++-------
> 3 files changed, 8 insertions(+), 9 deletions(-)
>
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index bcfc288dba3f..87d66c13bf5c 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -1502,7 +1502,7 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
> * Given that we do not allow direct reclaim to call us, we should
> * never be called in a recursive filesystem reclaim context.
> */
> - if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
> + if (WARN_ON_ONCE(current->memalloc_nofs))
> goto redirty;
>
> /*
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index cf18a3d2bc4c..eaf36ae1fde2 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -802,6 +802,7 @@ struct task_struct {
> unsigned in_memstall:1;
> #endif
> unsigned memalloc_noio:1;
> + unsigned memalloc_nofs:1;
>
> unsigned long atomic_flags; /* Flags requiring atomic access. */
>
> @@ -1505,7 +1506,6 @@ extern struct pid *cad_pid;
> #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */
> #define PF_FROZEN 0x00010000 /* Frozen for system suspend */
> #define PF_KSWAPD 0x00020000 /* I am kswapd */
> -#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */
> #define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to,
> * I am cleaning dirty pages from some other bdi. */
> #define PF_KTHREAD 0x00200000 /* I am a kernel thread */
> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> index b0089eadc367..08bc9d0606a8 100644
> --- a/include/linux/sched/mm.h
> +++ b/include/linux/sched/mm.h
> @@ -175,20 +175,19 @@ static inline bool in_vfork(struct task_struct *tsk)
>
> /*
> * Applies per-task gfp context to the given allocation flags.
> - * PF_MEMALLOC_NOFS implies GFP_NOFS
> * PF_MEMALLOC_NOCMA implies no allocation from CMA region.
> */
> static inline gfp_t current_gfp_context(gfp_t flags)
> {
> - if (unlikely(current->flags & (PF_MEMALLOC_NOFS | PF_MEMALLOC_NOCMA) ||
> - current->memalloc_noio)) {
> + if (unlikely((current->flags & PF_MEMALLOC_NOCMA) ||
> + current->memalloc_noio || current->memalloc_nofs)) {
> /*
> * NOIO implies both NOIO and NOFS and it is a weaker context
> * so always make sure it makes precedence
> */
> if (current->memalloc_noio)
> flags &= ~(__GFP_IO | __GFP_FS);
> - else if (current->flags & PF_MEMALLOC_NOFS)
> + else if (current->memalloc_nofs)
> flags &= ~__GFP_FS;
> #ifdef CONFIG_CMA
> if (current->flags & PF_MEMALLOC_NOCMA)
> @@ -254,8 +253,8 @@ static inline void memalloc_noio_restore(unsigned int flags)
> */
> static inline unsigned int memalloc_nofs_save(void)
> {
> - unsigned int flags = current->flags & PF_MEMALLOC_NOFS;
> - current->flags |= PF_MEMALLOC_NOFS;
> + unsigned int flags = current->memalloc_nofs;
> + current->memalloc_nofs = 1;
> return flags;
> }
>
> @@ -269,7 +268,7 @@ static inline unsigned int memalloc_nofs_save(void)
> */
> static inline void memalloc_nofs_restore(unsigned int flags)
> {
> - current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags;
> + current->memalloc_nofs = flags ? 1 : 0;
> }
>
> static inline unsigned int memalloc_noreclaim_save(void)
> --
> 2.27.0

--
Michal Hocko
SUSE Labs

2020-06-25 19:17:43

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 0/6] Overhaul memalloc_no*

On Thu, Jun 25, 2020 at 12:31:16PM +0100, Matthew Wilcox (Oracle) wrote:
> I want a memalloc_nowait like we have memalloc_noio and memalloc_nofs
> for an upcoming patch series, and Jens also wants it for non-blocking
> io_uring. It turns out we already have dm-bufio which could benefit
> from memalloc_nowait, so it may as well go into the tree now.
>
> The biggest problem is that we're basically out of PF_ flags, so we need
> to find somewhere else to store the PF_MEMALLOC_NOWAIT flag. It turns
> out the PF_ flags are really supposed to be used for flags which are
> accessed from other tasks, and the MEMALLOC flags are only going to
> be used by this task. So shuffling everything around frees up some PF
> flags and generally makes the world a better place.

So, uh, how does this intersect with the patch "xfs: reintroduce
PF_FSTRANS for transaction reservation recursion protection" that
re-adds PF_TRANS because uh I guess we lost some subtlety or another at
some point?

I don't have any strong opinion on this series one way or another, but
seeing as that patch was generating discussion about how PF_MEMALLOC_NO*
is not quite the same as PF_TRANS, I kinda want all these competing PF
tweaks and reworks to come together into a single series to review,
rather than the four(?) different people submitting conflicting changes.

[adding Yafang Shao to cc]

--D

> Patch series also available from
> http://git.infradead.org/users/willy/linux.git/shortlog/refs/heads/memalloc
>
> Matthew Wilcox (Oracle) (6):
> mm: Replace PF_MEMALLOC_NOIO with memalloc_noio
> mm: Add become_kswapd and restore_kswapd
> xfs: Convert to memalloc_nofs_save
> mm: Replace PF_MEMALLOC_NOFS with memalloc_nofs
> mm: Replace PF_MEMALLOC_NOCMA with memalloc_nocma
> mm: Add memalloc_nowait
>
> drivers/block/loop.c | 3 +-
> drivers/md/dm-bufio.c | 30 ++++--------
> drivers/md/dm-zoned-metadata.c | 5 +-
> fs/iomap/buffered-io.c | 2 +-
> fs/xfs/kmem.c | 2 +-
> fs/xfs/libxfs/xfs_btree.c | 14 +++---
> fs/xfs/xfs_aops.c | 4 +-
> fs/xfs/xfs_buf.c | 2 +-
> fs/xfs/xfs_linux.h | 6 ---
> fs/xfs/xfs_trans.c | 14 +++---
> fs/xfs/xfs_trans.h | 2 +-
> include/linux/sched.h | 7 +--
> include/linux/sched/mm.h | 84 ++++++++++++++++++++++++++--------
> kernel/sys.c | 8 ++--
> mm/vmscan.c | 16 +------
> 15 files changed, 105 insertions(+), 94 deletions(-)
>
> --
> 2.27.0
>

2020-06-25 23:33:15

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 0/6] Overhaul memalloc_no*

On Thu, Jun 25, 2020 at 11:48:32AM -0700, Darrick J. Wong wrote:
> On Thu, Jun 25, 2020 at 12:31:16PM +0100, Matthew Wilcox (Oracle) wrote:
> > I want a memalloc_nowait like we have memalloc_noio and memalloc_nofs
> > for an upcoming patch series, and Jens also wants it for non-blocking
> > io_uring. It turns out we already have dm-bufio which could benefit
> > from memalloc_nowait, so it may as well go into the tree now.
> >
> > The biggest problem is that we're basically out of PF_ flags, so we need
> > to find somewhere else to store the PF_MEMALLOC_NOWAIT flag. It turns
> > out the PF_ flags are really supposed to be used for flags which are
> > accessed from other tasks, and the MEMALLOC flags are only going to
> > be used by this task. So shuffling everything around frees up some PF
> > flags and generally makes the world a better place.
>
> So, uh, how does this intersect with the patch "xfs: reintroduce
> PF_FSTRANS for transaction reservation recursion protection" that
> re-adds PF_TRANS because uh I guess we lost some subtlety or another at
> some point?

I don't know. I read that thread, but I couldn't follow the argument.

2020-06-25 23:33:24

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH 0/6] Overhaul memalloc_no*

On Thu 25-06-20 11:48:32, Darrick J. Wong wrote:
> On Thu, Jun 25, 2020 at 12:31:16PM +0100, Matthew Wilcox (Oracle) wrote:
> > I want a memalloc_nowait like we have memalloc_noio and memalloc_nofs
> > for an upcoming patch series, and Jens also wants it for non-blocking
> > io_uring. It turns out we already have dm-bufio which could benefit
> > from memalloc_nowait, so it may as well go into the tree now.
> >
> > The biggest problem is that we're basically out of PF_ flags, so we need
> > to find somewhere else to store the PF_MEMALLOC_NOWAIT flag. It turns
> > out the PF_ flags are really supposed to be used for flags which are
> > accessed from other tasks, and the MEMALLOC flags are only going to
> > be used by this task. So shuffling everything around frees up some PF
> > flags and generally makes the world a better place.
>
> So, uh, how does this intersect with the patch "xfs: reintroduce
> PF_FSTRANS for transaction reservation recursion protection" that
> re-adds PF_TRANS because uh I guess we lost some subtlety or another at
> some point?

This is independent, really. It just relocates the NOFS flag. PF_TRANS
is reintroduced for a different reason. When I replaced the
original PF_TRANS with PF_MEMALLOC_NOFS I didn't realize that xfs needs
not only the NOFS semantics but also the transaction tracking, so this
cannot be a single bit only. So it has to be added back. But
PF_MEMALLOC_NOFS needs to stay for the scoped NOFS semantic.

Hope this clarifies it a bit.
--
Michal Hocko
SUSE Labs

2020-06-25 23:35:46

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 0/6] Overhaul memalloc_no*

On Thu, Jun 25, 2020 at 10:36:11PM +0200, Michal Hocko wrote:
> On Thu 25-06-20 11:48:32, Darrick J. Wong wrote:
> > On Thu, Jun 25, 2020 at 12:31:16PM +0100, Matthew Wilcox (Oracle) wrote:
> > > I want a memalloc_nowait like we have memalloc_noio and memalloc_nofs
> > > for an upcoming patch series, and Jens also wants it for non-blocking
> > > io_uring. It turns out we already have dm-bufio which could benefit
> > > from memalloc_nowait, so it may as well go into the tree now.
> > >
> > > The biggest problem is that we're basically out of PF_ flags, so we need
> > > to find somewhere else to store the PF_MEMALLOC_NOWAIT flag. It turns
> > > out the PF_ flags are really supposed to be used for flags which are
> > > accessed from other tasks, and the MEMALLOC flags are only going to
> > > be used by this task. So shuffling everything around frees up some PF
> > > flags and generally makes the world a better place.
> >
> > So, uh, how does this intersect with the patch "xfs: reintroduce
> > PF_FSTRANS for transaction reservation recursion protection" that
> > re-adds PF_TRANS because uh I guess we lost some subtlety or another at
> > some point?
>
> This is independent, really. It just relocates the NOFS flag. PF_TRANS
> is reintroduced for a different reason. When I have replaced the
> original PF_TRANS by PF_MEMALLOC_NOFS I didn't realized that xfs doesn't
> need only the NOFS semantic but also the transaction tracking so this
> cannot be a single bit only. So it has to be added back. But
> PF_MEMALLOC_NOFS needs to stay for the scoped NOFS semantic.

If XFS needs to track transactions, why doesn't it use
current->journal_info like btrfs/ceph/ext4/gfs2/nilfs2/reiserfs?

2020-06-26 16:37:49

by Mikulas Patocka

[permalink] [raw]
Subject: Re: [PATCH 0/6] Overhaul memalloc_no*

Hi

I suggest to join memalloc_noio and memalloc_nofs into just one flag that
prevents both filesystem recursion and i/o recursion.

Note that any I/O can recurse into a filesystem via the loop device, thus
it doesn't make much sense to have a context where PF_MEMALLOC_NOFS is set
and PF_MEMALLOC_NOIO is not set.

Mikulas

On Thu, 25 Jun 2020, Matthew Wilcox (Oracle) wrote:

> I want a memalloc_nowait like we have memalloc_noio and memalloc_nofs
> for an upcoming patch series, and Jens also wants it for non-blocking
> io_uring. It turns out we already have dm-bufio which could benefit
> from memalloc_nowait, so it may as well go into the tree now.
>
> The biggest problem is that we're basically out of PF_ flags, so we need
> to find somewhere else to store the PF_MEMALLOC_NOWAIT flag. It turns
> out the PF_ flags are really supposed to be used for flags which are
> accessed from other tasks, and the MEMALLOC flags are only going to
> be used by this task. So shuffling everything around frees up some PF
> flags and generally makes the world a better place.
>
> Patch series also available from
> http://git.infradead.org/users/willy/linux.git/shortlog/refs/heads/memalloc
>
> Matthew Wilcox (Oracle) (6):
> mm: Replace PF_MEMALLOC_NOIO with memalloc_noio
> mm: Add become_kswapd and restore_kswapd
> xfs: Convert to memalloc_nofs_save
> mm: Replace PF_MEMALLOC_NOFS with memalloc_nofs
> mm: Replace PF_MEMALLOC_NOIO with memalloc_nocma
> mm: Add memalloc_nowait
>
> drivers/block/loop.c | 3 +-
> drivers/md/dm-bufio.c | 30 ++++--------
> drivers/md/dm-zoned-metadata.c | 5 +-
> fs/iomap/buffered-io.c | 2 +-
> fs/xfs/kmem.c | 2 +-
> fs/xfs/libxfs/xfs_btree.c | 14 +++---
> fs/xfs/xfs_aops.c | 4 +-
> fs/xfs/xfs_buf.c | 2 +-
> fs/xfs/xfs_linux.h | 6 ---
> fs/xfs/xfs_trans.c | 14 +++---
> fs/xfs/xfs_trans.h | 2 +-
> include/linux/sched.h | 7 +--
> include/linux/sched/mm.h | 84 ++++++++++++++++++++++++++--------
> kernel/sys.c | 8 ++--
> mm/vmscan.c | 16 +------
> 15 files changed, 105 insertions(+), 94 deletions(-)
>
> --
> 2.27.0
>

2020-06-26 23:12:31

by Dave Chinner

[permalink] [raw]
Subject: Re: [PATCH 0/6] Overhaul memalloc_no*

On Fri, Jun 26, 2020 at 11:02:19AM -0400, Mikulas Patocka wrote:
> Hi
>
> I suggest to join memalloc_noio and memalloc_nofs into just one flag that
> prevents both filesystem recursion and i/o recursion.
>
> Note that any I/O can recurse into a filesystem via the loop device, thus
> it doesn't make much sense to have a context where PF_MEMALLOC_NOFS is set
> and PF_MEMALLOC_NOIO is not set.

Correct me if I'm wrong, but I think that will prevent swapping from
GFP_NOFS memory reclaim contexts. IOWs, this will substantially
change the behaviour of the memory reclaim system under sustained
GFP_NOFS memory pressure. Sustained GFP_NOFS memory pressure is
quite common, so I really don't think we want to telling memory
reclaim "you can't do IO at all" when all we are trying to do is
prevent recursion back into the same filesystem.

Given that the loop device IO path already operates under
memalloc_noio context (i.e. the recursion restriction is applied in
only the context that needs it), I see no reason for making that a
global reclaim limitation....

In reality, we need to be moving the other way with GFP_NOFS - to
fine grained anti-recursion contexts, not more broad contexts.

That is, GFP_NOFS prevents recursion into any filesystem, not just
the one that we are actively operating on and needing to prevent
recursion back into. We can safely have reclaim do reclaim work on
other filesystems without fear of recursion deadlocks, but the
memory reclaim infrastructure does not provide that capability.(*)

e.g. if memalloc_nofs_save() took a reclaim context structure in which
the filesystem put the superblock, the superblock's nesting depth
(because layering on loop devices can create cross-filesystem
recursion dependencies), and any other filesystem private data the
fs wanted to add, we could actually have reclaim only avoid reclaim
from filesystems where there is a deadlock possibility. e.g:

- superblock nesting depth is different, apply GFP_NOFS
reclaim unconditionally
- superblock different, apply GFP_KERNEL reclaim
- superblock the same, pass context to filesystem to
decide if reclaim from the superblock is safe.

At this point, we get memory reclaim always able to
reclaim from filesystems that are not at risk of recursion
deadlocks. Direct reclaim is much more likely to be able to make
progress now because it is much less restricted in what it can
reclaim. That's going to make direct reclaim faster and more
efficient, and that's the ultimate goal we are aiming to achieve
here...

Cheers,

Dave.
--
Dave Chinner
[email protected]

2020-06-27 13:09:59

by Mikulas Patocka

[permalink] [raw]
Subject: Re: [PATCH 0/6] Overhaul memalloc_no*



On Sat, 27 Jun 2020, Dave Chinner wrote:

> On Fri, Jun 26, 2020 at 11:02:19AM -0400, Mikulas Patocka wrote:
> > Hi
> >
> > I suggest to join memalloc_noio and memalloc_nofs into just one flag that
> > prevents both filesystem recursion and i/o recursion.
> >
> > Note that any I/O can recurse into a filesystem via the loop device, thus
> > it doesn't make much sense to have a context where PF_MEMALLOC_NOFS is set
> > and PF_MEMALLOC_NOIO is not set.
>
> Correct me if I'm wrong, but I think that will prevent swapping from
> GFP_NOFS memory reclaim contexts.

Yes.

> IOWs, this will substantially
> change the behaviour of the memory reclaim system under sustained
> GFP_NOFS memory pressure. Sustained GFP_NOFS memory pressure is
> quite common, so I really don't think we want to telling memory
> reclaim "you can't do IO at all" when all we are trying to do is
> prevent recursion back into the same filesystem.

So, we can define __GFP_ONLY_SWAP_IO and __GFP_IO.

> Given that the loop device IO path already operates under
> memalloc_noio context, (i.e. the recursion restriction is applied in
> only the context that needs is) I see no reason for making that a
> global reclaim limitation....

I think this is a problem.

Suppose that a filesystem does GFP_NOFS allocation, the allocation
triggers an IO and waits for it to finish, the loop device driver
redirects the IO to the same filesystem that did the GFP_NOFS allocation.

I saw this deadlock in the past in the dm-bufio subsystem - see the commit
9d28eb12447ee08bb5d1e8bb3195cf20e1ecd1c0 that fixed it.

Other subsystems that do IO in GFP_NOFS context may deadlock just like
bufio.

Mikulas

2020-06-29 00:37:09

by Dave Chinner

[permalink] [raw]
Subject: Re: [PATCH 0/6] Overhaul memalloc_no*

On Sat, Jun 27, 2020 at 09:09:09AM -0400, Mikulas Patocka wrote:
>
>
> On Sat, 27 Jun 2020, Dave Chinner wrote:
>
> > On Fri, Jun 26, 2020 at 11:02:19AM -0400, Mikulas Patocka wrote:
> > > Hi
> > >
> > > I suggest to join memalloc_noio and memalloc_nofs into just one flag that
> > > prevents both filesystem recursion and i/o recursion.
> > >
> > > Note that any I/O can recurse into a filesystem via the loop device, thus
> > > it doesn't make much sense to have a context where PF_MEMALLOC_NOFS is set
> > > and PF_MEMALLOC_NOIO is not set.
> >
> > Correct me if I'm wrong, but I think that will prevent swapping from
> > GFP_NOFS memory reclaim contexts.
>
> Yes.
>
> > IOWs, this will substantially
> > change the behaviour of the memory reclaim system under sustained
> > GFP_NOFS memory pressure. Sustained GFP_NOFS memory pressure is
> > quite common, so I really don't think we want to telling memory
> > reclaim "you can't do IO at all" when all we are trying to do is
> > prevent recursion back into the same filesystem.
>
> So, we can define __GFP_ONLY_SWAP_IO and __GFP_IO.

Uh, why?

Exactly what problem are you trying to solve here?

> > Given that the loop device IO path already operates under
> > memalloc_noio context, (i.e. the recursion restriction is applied in
> > only the context that needs is) I see no reason for making that a
> > global reclaim limitation....
>
> I think this is a problem.
>
> Suppose that a filesystem does GFP_NOFS allocation, the allocation
> triggers an IO and waits for it to finish, the loop device driver
> redirects the IO to the same filesystem that did the GFP_NOFS allocation.

The loop device IO path is under memalloc_noio. By -definition-,
allocations in that context cannot recurse back into filesystem
level reclaim.

So either your aren't explaining the problem you are trying to solve
clearly, or you're talking about allocations in the IO path that are
broken because they don't use GFP_NOIO correctly...

> I saw this deadlock in the past in the dm-bufio subsystem - see the commit
> 9d28eb12447ee08bb5d1e8bb3195cf20e1ecd1c0 that fixed it.

2014?

/me looks closer.

Hmmm. Only sent to dm-devel, no comments, no review, just merged.
No surprise that nobody else actually knows about this commit. Well,
time to review it ~6 years after it was merged....

| dm-bufio tested for __GFP_IO. However, dm-bufio can run on a loop block
| device that makes calls into the filesystem. If __GFP_IO is present and
| __GFP_FS isn't, dm-bufio could still block on filesystem operations if it
| runs on a loop block device.

OK, so from an architectural POV, this commit is fundamentally
broken - block/device layer allocation should not allow reclaim
recursion into filesystems because filesystems are dependent on
the block layer making forwards progress. This commit is trying to
work around the loop device doing GFP_KERNEL/GFP_NOFS context
allocation in the back end IO path of the loop device. This part of the
loop device is a block device, so needs to run under GFP_NOIO
context.

IOWs, this commit just papered over the reclaim context layering
violation in the loop device by trying to avoid blocking filesystem
IO in the dm-bufio shrinker context just in case it was IO from a
loop device that was incorrectly tagged as GFP_KERNEL.

So, step forward 5 years to 2019, and this change was made:

commit d0a255e795ab976481565f6ac178314b34fbf891
Author: Mikulas Patocka <[email protected]>
Date: Thu Aug 8 11:17:01 2019 -0400

loop: set PF_MEMALLOC_NOIO for the worker thread

A deadlock with this stacktrace was observed.

The loop thread does a GFP_KERNEL allocation, it calls into dm-bufio
shrinker and the shrinker depends on I/O completion in the dm-bufio
subsystem.

In order to fix the deadlock (and other similar ones), we set the flag
PF_MEMALLOC_NOIO at loop thread entry.

PID: 474 TASK: ffff8813e11f4600 CPU: 10 COMMAND: "kswapd0"
#0 [ffff8813dedfb938] __schedule at ffffffff8173f405
#1 [ffff8813dedfb990] schedule at ffffffff8173fa27
#2 [ffff8813dedfb9b0] schedule_timeout at ffffffff81742fec
#3 [ffff8813dedfba60] io_schedule_timeout at ffffffff8173f186
#4 [ffff8813dedfbaa0] bit_wait_io at ffffffff8174034f
#5 [ffff8813dedfbac0] __wait_on_bit at ffffffff8173fec8
#6 [ffff8813dedfbb10] out_of_line_wait_on_bit at ffffffff8173ff81
#7 [ffff8813dedfbb90] __make_buffer_clean at ffffffffa038736f [dm_bufio]
#8 [ffff8813dedfbbb0] __try_evict_buffer at ffffffffa0387bb8 [dm_bufio]
#9 [ffff8813dedfbbd0] dm_bufio_shrink_scan at ffffffffa0387cc3 [dm_bufio]
#10 [ffff8813dedfbc40] shrink_slab at ffffffff811a87ce
#11 [ffff8813dedfbd30] shrink_zone at ffffffff811ad778
#12 [ffff8813dedfbdc0] kswapd at ffffffff811ae92f
#13 [ffff8813dedfbec0] kthread at ffffffff810a8428
#14 [ffff8813dedfbf50] ret_from_fork at ffffffff81745242

PID: 14127 TASK: ffff881455749c00 CPU: 11 COMMAND: "loop1"
#0 [ffff88272f5af228] __schedule at ffffffff8173f405
#1 [ffff88272f5af280] schedule at ffffffff8173fa27
#2 [ffff88272f5af2a0] schedule_preempt_disabled at ffffffff8173fd5e
#3 [ffff88272f5af2b0] __mutex_lock_slowpath at ffffffff81741fb5
#4 [ffff88272f5af330] mutex_lock at ffffffff81742133
#5 [ffff88272f5af350] dm_bufio_shrink_count at ffffffffa03865f9 [dm_bufio]
#6 [ffff88272f5af380] shrink_slab at ffffffff811a86bd
#7 [ffff88272f5af470] shrink_zone at ffffffff811ad778
#8 [ffff88272f5af500] do_try_to_free_pages at ffffffff811adb34
#9 [ffff88272f5af590] try_to_free_pages at ffffffff811adef8
#10 [ffff88272f5af610] __alloc_pages_nodemask at ffffffff811a09c3
#11 [ffff88272f5af710] alloc_pages_current at ffffffff811e8b71
#12 [ffff88272f5af760] new_slab at ffffffff811f4523
#13 [ffff88272f5af7b0] __slab_alloc at ffffffff8173a1b5
#14 [ffff88272f5af880] kmem_cache_alloc at ffffffff811f484b
#15 [ffff88272f5af8d0] do_blockdev_direct_IO at ffffffff812535b3
#16 [ffff88272f5afb00] __blockdev_direct_IO at ffffffff81255dc3
#17 [ffff88272f5afb30] xfs_vm_direct_IO at ffffffffa01fe3fc [xfs]
#18 [ffff88272f5afb90] generic_file_read_iter at ffffffff81198994
#19 [ffff88272f5afc50] __dta_xfs_file_read_iter_2398 at ffffffffa020c970 [xfs]
#20 [ffff88272f5afcc0] lo_rw_aio at ffffffffa0377042 [loop]
#21 [ffff88272f5afd70] loop_queue_work at ffffffffa0377c3b [loop]
#22 [ffff88272f5afe60] kthread_worker_fn at ffffffff810a8a0c
#23 [ffff88272f5afec0] kthread at ffffffff810a8428
#24 [ffff88272f5aff50] ret_from_fork at ffffffff81745242

Signed-off-by: Mikulas Patocka <[email protected]>
Cc: [email protected]
Signed-off-by: Jens Axboe <[email protected]>

That's the *same bug* as the 2014 commit was trying to address. But
because the 2014 commit didn't actually address the underlying
architectural layering issue in the loop device, the problem was
still there.

The 2019 commit corrected the allocation context of the loop device
to unconditionally use GFP_NOIO, and so prevents recursion back into
both the filesystem and the block/device layers from the loop device
IO path. Hence reclaim contexts are layered correctly again, and the
deadlock in the dm-bufio code goes away.

And that means you probably should revert the 2014 commit because
it's a nasty layering violation that never should have been made....

> Other subsystems that do IO in GFP_NOFS context may deadlock just like
> bufio.

Then they are as buggy as the dm-bufio code. Shrinkers needing to be
aware of reclaim contexts above the layer the shrinker belongs to is
a Big Red Flag that indicates something is violating reclaim
recursion rules. i.e. that an allocation is simply not using
GFP_NOFS/GFP_NOIO correctly. That's not a bug in the shrinker,
that's a bug in the code that is doing the memory allocation.

I still don't see a need for more reclaim recursion layers here...

Cheers,

Dave.
--
Dave Chinner
[email protected]

2020-06-29 18:48:03

by Mikulas Patocka

[permalink] [raw]
Subject: Re: [PATCH 0/6] Overhaul memalloc_no*



On Mon, 29 Jun 2020, Dave Chinner wrote:

> On Sat, Jun 27, 2020 at 09:09:09AM -0400, Mikulas Patocka wrote:
> >
> >
> > On Sat, 27 Jun 2020, Dave Chinner wrote:
> >
> > > On Fri, Jun 26, 2020 at 11:02:19AM -0400, Mikulas Patocka wrote:
> > > > Hi
> > > >
> > > > I suggest to join memalloc_noio and memalloc_nofs into just one flag that
> > > > prevents both filesystem recursion and i/o recursion.
> > > >
> > > > Note that any I/O can recurse into a filesystem via the loop device, thus
> > > > it doesn't make much sense to have a context where PF_MEMALLOC_NOFS is set
> > > > and PF_MEMALLOC_NOIO is not set.
> > >
> > > Correct me if I'm wrong, but I think that will prevent swapping from
> > > GFP_NOFS memory reclaim contexts.
> >
> > Yes.
> >
> > > IOWs, this will substantially
> > > change the behaviour of the memory reclaim system under sustained
> > > GFP_NOFS memory pressure. Sustained GFP_NOFS memory pressure is
> > > quite common, so I really don't think we want to telling memory
> > > reclaim "you can't do IO at all" when all we are trying to do is
> > > prevent recursion back into the same filesystem.
> >
> > So, we can define __GFP_ONLY_SWAP_IO and __GFP_IO.
>
> Uh, why?
>
> Exactly what problem are you trying to solve here?

This:

1. The filesystem does a GFP_NOFS allocation.
2. The allocation calls directly a dm-bufio shrinker.
3. The dm-bufio shrinker sees that there is __GFP_IO set, so it assumes
that it can do I/O. It selects some dirty buffers, writes them back and
waits for the I/O to finish.
4. The dirty buffers belong to a loop device.
5. The loop device thread calls the filesystem that did the GFP_NOFS
allocation in step 1 (and that is still waiting for the allocation to
succeed).

Note that setting PF_MEMALLOC_NOIO on the loop thread won't help with this
deadlock.

Do you argue that this is a bug in dm-bufio? Or a bug in the kernel? Or
that it can't happen?

> > I saw this deadlock in the past in the dm-bufio subsystem - see the commit
> > 9d28eb12447ee08bb5d1e8bb3195cf20e1ecd1c0 that fixed it.
>
> 2014?
>
> /me looks closer.
>
> Hmmm. Only sent to dm-devel, no comments, no review, just merged.
> No surprise that nobody else actually knows about this commit. Well,
> time to review it ~6 years after it was merged....
>
> | dm-bufio tested for __GFP_IO. However, dm-bufio can run on a loop block
> | device that makes calls into the filesystem. If __GFP_IO is present and
> | __GFP_FS isn't, dm-bufio could still block on filesystem operations if it
> | runs on a loop block device.
>
> OK, so from an architectural POV, this commit is fundamentally
> broken - block/device layer allocation should not allow relcaim
> recursion into filesystems because filesystems are dependent on
> the block layer making forwards progress. This commit is trying to
> work around the loop device doing GFP_KERNEL/GFP_NOFS context
> allocation back end IO path of the loop device. This part of the
> loop device is a block device, so needs to run under GFP_NOIO
> context.

I agree that it is broken, but it fixes the above deadlock.

Mikulas

2020-06-29 20:59:06

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH 0/6] Overhaul memalloc_no*

On Sat 27-06-20 09:08:47, Dave Chinner wrote:
> On Fri, Jun 26, 2020 at 11:02:19AM -0400, Mikulas Patocka wrote:
> > Hi
> >
> > I suggest to join memalloc_noio and memalloc_nofs into just one flag that
> > prevents both filesystem recursion and i/o recursion.
> >
> > Note that any I/O can recurse into a filesystem via the loop device, thus
> > it doesn't make much sense to have a context where PF_MEMALLOC_NOFS is set
> > and PF_MEMALLOC_NOIO is not set.
>
> Correct me if I'm wrong, but I think that will prevent swapping from
> GFP_NOFS memory reclaim contexts. IOWs, this will substantially
> change the behaviour of the memory reclaim system under sustained
> GFP_NOFS memory pressure. Sustained GFP_NOFS memory pressure is
> quite common, so I really don't think we want to telling memory
> reclaim "you can't do IO at all" when all we are trying to do is
> prevent recursion back into the same filesystem.
>
> Given that the loop device IO path already operates under
> memalloc_noio context, (i.e. the recursion restriction is applied in
> only the context that needs is) I see no reason for making that a
> global reclaim limitation....
>
> In reality, we need to be moving the other way with GFP_NOFS - to
> fine grained anti-recursion contexts, not more broad contexts.

Absolutely agreed! It is not really hard to see system struggling due to
heavy FS metadata workload while there are objects which could be
reclaimed.

> That is, GFP_NOFS prevents recursion into any filesystem, not just
> the one that we are actively operating on and needing to prevent
> recursion back into. We can safely have reclaim do relcaim work on
> other filesysetms without fear of recursion deadlocks, but the
> memory reclaim infrastructure does not provide that capability.(*)
>
> e.g. if memalloc_nofs_save() took a reclaim context structure that
> the filesystem put the superblock, the superblock's nesting depth
> (because layering on loop devices can create cross-filesystem
> recursion dependencies), and any other filesyetm private data the
> fs wanted to add, we could actually have reclaim only avoid reclaim
> from filesytsems where there is a deadlock possiblity. e.g:
>
> - superblock nesting depth is different, apply GFP_NOFS
> reclaim unconditionally
> - superblock different apply GFP_KERNEL reclaim
> - superblock the same, pass context to filesystem to
> decide if reclaim from the sueprblock is safe.
>
> At this point, we get memory reclaim able to always be able to
> reclaim from filesystems that are not at risk of recursion
> deadlocks. Direct reclaim is much more likely to be able to make
> progress now because it is much less restricted in what it can
> reclaim. That's going to make direct relcaim faster and more
> efficient, and taht's the ultimate goal we are aiming to acheive
> here...

Yes, we have discussed something like that few years back at LSFMM IIRC.
The scoped NOFS/NOIO api was just a first step to reduce explicit
NOFS/NOIO usage with a hope that we will get no-recursion entry points
much more well defined and get rid of many instances where "this is a fs
code so it has to use NOFS gfp mask".

Some of that has happened and that is really great. On the other hand
many people still like to use that api as a workaround for an immediate
problem because no-recursion scopes are much harder to recognize unless
you are supper familiar with the specific fs/IO layer implementation.
So this is definitely not a project for somebody to go over all code and
just do the clean up.

Thanks!
--
Michal Hocko
SUSE Labs

2020-06-29 22:37:00

by Dave Chinner

[permalink] [raw]
Subject: Re: [PATCH 0/6] Overhaul memalloc_no*

On Mon, Jun 29, 2020 at 09:43:23AM -0400, Mikulas Patocka wrote:
> On Mon, 29 Jun 2020, Dave Chinner wrote:
> > On Sat, Jun 27, 2020 at 09:09:09AM -0400, Mikulas Patocka wrote:
> > > On Sat, 27 Jun 2020, Dave Chinner wrote:
> > > > On Fri, Jun 26, 2020 at 11:02:19AM -0400, Mikulas Patocka wrote:
> > > > > Hi
> > > > >
> > > > > I suggest to join memalloc_noio and memalloc_nofs into just one flag that
> > > > > prevents both filesystem recursion and i/o recursion.
> > > > >
> > > > > Note that any I/O can recurse into a filesystem via the loop device, thus
> > > > > it doesn't make much sense to have a context where PF_MEMALLOC_NOFS is set
> > > > > and PF_MEMALLOC_NOIO is not set.
> > > >
> > > > Correct me if I'm wrong, but I think that will prevent swapping from
> > > > GFP_NOFS memory reclaim contexts.
> > >
> > > Yes.
> > >
> > > > IOWs, this will substantially
> > > > change the behaviour of the memory reclaim system under sustained
> > > > GFP_NOFS memory pressure. Sustained GFP_NOFS memory pressure is
> > > > quite common, so I really don't think we want to telling memory
> > > > reclaim "you can't do IO at all" when all we are trying to do is
> > > > prevent recursion back into the same filesystem.
> > >
> > > So, we can define __GFP_ONLY_SWAP_IO and __GFP_IO.
> >
> > Uh, why?
> >
> > Exactly what problem are you trying to solve here?
>
> This:
>
> 1. The filesystem does a GFP_NOFS allocation.
> 2. The allocation calls directly a dm-bufio shrinker.
> 3. The dm-bufio shrinker sees that there is __GFP_IO set, so it assumes
> that it can do I/O. It selects some dirty buffers, writes them back and
> waits for the I/O to finish.

And so you are doing IO in a GFP_NOFS context because someone thought
the block layer can't recurse back into filesystems? That's a broken
assumption and has been since the loop device was introduced over a
couple of decades ago. I mean, the dm-bufio IO submission path uses
GFP_NOIO for obvious reasons, but once it's in the next device down
it loses all control of the submission context.

This is what I mean about "looking at reclaim contexts above the
current layer is a Big Red Flag"? The fundamental assumption of
dm-bufio that it can issue IO in GFP_NOFS context and not have a
lower layer recurse back into a filesystem has always been
incorrect. Just because the loop device now does GFP_NOIO
allocation, that doesn't mean what dm-bufio is doing in this
shrinker is correct or valid.

Because, as you point out:

> 4. The dirty buffers belong to a loop device.
> 5. The loop device thread calls the filesystem that did the GFP_NOFS
> allocation in step 1 (and that is still waiting for the allocation to
> succeed).
> Note that setting PF_MEMALLOC_NOIO on the loop thread won't help with this
> deadlock.

Right, re-entering the filesystem might block on a lock, IO, memory
allocation, journal space reservation, etc. Indeed, it might not
even be able to issue transactions because the allocating context is
using GFP_NOFS because it is already running a transaction.

> Do you argue that this is a bug in dm-bufio? Or a bug in the kernel? Or
> that it can't happen?

That's a bug in dm-bufio - dm is a layered block device and so has
_always_ been able to have filesystems both above and below
it in the storage stack. i.e. the assumption that there is no
filesystem context under the DM layers has always been wrong.

i.e. the memory reclaim context specifically directed dm-bufio that
whatever the shrinker does, it must not recurse into the filesystem
layer. It is the responsibility of the shrinker to obey the
constraints it was given by memory reclaim, and the dm-bufio
shrinker's assumption that there cannot be a filesystem below the DM
device violates this directive because, quite clearly, there can be
filesystems underneath DM devices.

IOWs, assuming that you can issue and block on IO from a -layered
block device- in GFP_NOFS shrinker context is flawed. i.e. Anything
that presents as a block device that is layered on top of another
block device can recurse into a filesystem as they can sit on top of
a loop device. This has always been the case, and that means the
assumptions the dm-bufio shrinker is making about what it can do in
GFP_NOFS shrinker context has always been incorrect.

Remember that I explained "you should not block kswapd" in this
shrinker a year ago?

| What follows from that, and is pertinent for in this situation, is
| that if you don't block kswapd, then other reclaim contexts are not
| going to get stuck waiting for it regardless of the reclaim context
| they use.

https://lore.kernel.org/linux-fsdevel/[email protected]/

If you did that when I suggested it, this problem would be solved.
i.e. The only way to fix this problem once and for all is to stop
using the shrinker as a mechanism to issue and wait on IO. If you
need background writeback of dirty buffers, do it from a
WQ_MEM_RECLAIM workqueue that isn't directly in the memory reclaim
path and so can issue writeback and block safely from a GFP_KERNEL
context. Kick the workqueue from the shrinker context, but get rid
of the IO submission and waiting from the shrinker and all the
GFP_NOFS memory reclaim recursion problems go away.

Cheers,

Dave.
--
Dave Chinner
[email protected]

2020-07-03 14:27:42

by Mikulas Patocka

[permalink] [raw]
Subject: [PATCH] dm-bufio: do cleanup from a workqueue



On Tue, 30 Jun 2020, Dave Chinner wrote:

> https://lore.kernel.org/linux-fsdevel/[email protected]/
>
> If you did that when I suggested it, this problem would be solved.
> i.e. The only way to fix this problem once adn for all is to stop
> using the shrinker as a mechanism to issue and wait on IO. If you
> need background writeback of dirty buffers, do it from a
> WQ_MEM_RECLAIM workqueue that isn't directly in the memory reclaim
> path and so can issue writeback and block safely from a GFP_KERNEL
> context. Kick the workqueue from the shrinker context, but get rid
> of the IO submission and waiting from the shrinker and all the
> GFP_NOFS memory reclaim recursion problems go away.
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> [email protected]

Hi

This is a patch that moves buffer cleanup to a workqueue. Please review
it.

Mikulas



From: Mikulas Patocka <[email protected]>

kswapd should not block because it degrades system performance.
So, move reclaim of buffers to a workqueue.

Signed-off-by: Mikulas Patocka <[email protected]>

---
drivers/md/dm-bufio.c | 60 ++++++++++++++++++++++++++++++++++----------------
1 file changed, 41 insertions(+), 19 deletions(-)

Index: linux-2.6/drivers/md/dm-bufio.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-bufio.c 2020-07-03 14:07:43.000000000 +0200
+++ linux-2.6/drivers/md/dm-bufio.c 2020-07-03 15:35:23.000000000 +0200
@@ -108,7 +108,10 @@ struct dm_bufio_client {
int async_write_error;

struct list_head client_list;
+
struct shrinker shrinker;
+ struct work_struct shrink_work;
+ atomic_long_t need_shrink;
};

/*
@@ -1634,8 +1637,7 @@ static unsigned long get_retain_buffers(
return retain_bytes;
}

-static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
- gfp_t gfp_mask)
+static void __scan(struct dm_bufio_client *c)
{
int l;
struct dm_buffer *b, *tmp;
@@ -1646,42 +1648,58 @@ static unsigned long __scan(struct dm_bu

for (l = 0; l < LIST_SIZE; l++) {
list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
- if (__try_evict_buffer(b, gfp_mask))
+ if (count - freed <= retain_target)
+ atomic_long_set(&c->need_shrink, 0);
+ if (!atomic_long_read(&c->need_shrink))
+ return;
+ if (__try_evict_buffer(b, GFP_KERNEL)) {
+ atomic_long_dec(&c->need_shrink);
freed++;
- if (!--nr_to_scan || ((count - freed) <= retain_target))
- return freed;
+ }
cond_resched();
}
}
- return freed;
}

-static unsigned long
-dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+static void shrink_work(struct work_struct *w)
+{
+ struct dm_bufio_client *c = container_of(w, struct dm_bufio_client, shrink_work);
+
+ dm_bufio_lock(c);
+ __scan(c);
+ dm_bufio_unlock(c);
+}
+
+static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
struct dm_bufio_client *c;
- unsigned long freed;

c = container_of(shrink, struct dm_bufio_client, shrinker);
- if (sc->gfp_mask & __GFP_FS)
- dm_bufio_lock(c);
- else if (!dm_bufio_trylock(c))
- return SHRINK_STOP;
+ atomic_long_add(sc->nr_to_scan, &c->need_shrink);
+ queue_work(dm_bufio_wq, &c->shrink_work);

- freed = __scan(c, sc->nr_to_scan, sc->gfp_mask);
- dm_bufio_unlock(c);
- return freed;
+ return sc->nr_to_scan;
}

-static unsigned long
-dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
unsigned long count = READ_ONCE(c->n_buffers[LIST_CLEAN]) +
READ_ONCE(c->n_buffers[LIST_DIRTY]);
unsigned long retain_target = get_retain_buffers(c);
+ unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink);
+
+ if (unlikely(count < retain_target))
+ count = 0;
+ else
+ count -= retain_target;

- return (count < retain_target) ? 0 : (count - retain_target);
+ if (unlikely(count < queued_for_cleanup))
+ count = 0;
+ else
+ count -= queued_for_cleanup;
+
+ return count;
}

/*
@@ -1772,6 +1790,9 @@ struct dm_bufio_client *dm_bufio_client_
__free_buffer_wake(b);
}

+ INIT_WORK(&c->shrink_work, shrink_work);
+ atomic_long_set(&c->need_shrink, 0);
+
c->shrinker.count_objects = dm_bufio_shrink_count;
c->shrinker.scan_objects = dm_bufio_shrink_scan;
c->shrinker.seeks = 1;
@@ -1817,6 +1838,7 @@ void dm_bufio_client_destroy(struct dm_b
drop_buffers(c);

unregister_shrinker(&c->shrinker);
+ flush_work(&c->shrink_work);

mutex_lock(&dm_bufio_clients_lock);