Hi,
this series aims to unify the setting and clearing of PF_MEMALLOC, which
prevents recursive reclaim. There are some places that clear the flag
unconditionally from current->flags, which may result in clearing a
pre-existing flag. This already resulted in a bug report that Patch 1 fixes
(without the new helpers, to make backporting easier). Patch 2 introduces the
new helpers, modelled after existing memalloc_noio_* and memalloc_nofs_*
helpers, and converts mm core to use them. Patches 3 and 4 convert non-mm code.
Based on next-20170404.
Vlastimil Babka (4):
mm: prevent potential recursive reclaim due to clearing PF_MEMALLOC
mm: introduce memalloc_noreclaim_{save,restore}
treewide: convert PF_MEMALLOC manipulations to new helpers
mtd: nand: nandsim: convert to memalloc_noreclaim_*()
drivers/block/nbd.c | 7 ++++---
drivers/mtd/nand/nandsim.c | 29 +++++++++--------------------
drivers/scsi/iscsi_tcp.c | 7 ++++---
include/linux/sched/mm.h | 12 ++++++++++++
mm/page_alloc.c | 10 ++++++----
mm/vmscan.c | 17 +++++++++++------
net/core/dev.c | 7 ++++---
net/core/sock.c | 7 ++++---
8 files changed, 54 insertions(+), 42 deletions(-)
--
2.12.2
We now have memalloc_noreclaim_{save,restore} helpers for robust setting and
clearing of PF_MEMALLOC. Let's convert the code which was using the generic
tsk_restore_flags(). No functional change.
Signed-off-by: Vlastimil Babka <[email protected]>
Cc: Josef Bacik <[email protected]>
Cc: Lee Duncan <[email protected]>
Cc: Chris Leech <[email protected]>
Cc: "David S. Miller" <[email protected]>
Cc: Eric Dumazet <[email protected]>
---
drivers/block/nbd.c | 7 ++++---
drivers/scsi/iscsi_tcp.c | 7 ++++---
net/core/dev.c | 7 ++++---
net/core/sock.c | 7 ++++---
4 files changed, 16 insertions(+), 12 deletions(-)
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 03ae72985c79..929fc548c7fb 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -18,6 +18,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/stat.h>
@@ -210,7 +211,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
struct socket *sock = nbd->socks[index]->sock;
int result;
struct msghdr msg;
- unsigned long pflags = current->flags;
+ unsigned int noreclaim_flag;
if (unlikely(!sock)) {
dev_err_ratelimited(disk_to_dev(nbd->disk),
@@ -221,7 +222,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
msg.msg_iter = *iter;
- current->flags |= PF_MEMALLOC;
+ noreclaim_flag = memalloc_noreclaim_save();
do {
sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
msg.msg_name = NULL;
@@ -244,7 +245,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
*sent += result;
} while (msg_data_left(&msg));
- tsk_restore_flags(current, pflags, PF_MEMALLOC);
+ memalloc_noreclaim_restore(noreclaim_flag);
return result;
}
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index 4228aba1f654..4842fc0e809d 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -30,6 +30,7 @@
#include <linux/types.h>
#include <linux/inet.h>
#include <linux/slab.h>
+#include <linux/sched/mm.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
@@ -371,10 +372,10 @@ static inline int iscsi_sw_tcp_xmit_qlen(struct iscsi_conn *conn)
static int iscsi_sw_tcp_pdu_xmit(struct iscsi_task *task)
{
struct iscsi_conn *conn = task->conn;
- unsigned long pflags = current->flags;
+ unsigned int noreclaim_flag;
int rc = 0;
- current->flags |= PF_MEMALLOC;
+ noreclaim_flag = memalloc_noreclaim_save();
while (iscsi_sw_tcp_xmit_qlen(conn)) {
rc = iscsi_sw_tcp_xmit(conn);
@@ -387,7 +388,7 @@ static int iscsi_sw_tcp_pdu_xmit(struct iscsi_task *task)
rc = 0;
}
- tsk_restore_flags(current, pflags, PF_MEMALLOC);
+ memalloc_noreclaim_restore(noreclaim_flag);
return rc;
}
diff --git a/net/core/dev.c b/net/core/dev.c
index fde8b3f7136b..e0705a126b24 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -81,6 +81,7 @@
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
@@ -4227,7 +4228,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
int ret;
if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
- unsigned long pflags = current->flags;
+ unsigned int noreclaim_flag;
/*
* PFMEMALLOC skbs are special, they should
@@ -4238,9 +4239,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
* Use PF_MEMALLOC as this saves us from propagating the allocation
* context down to all allocation sites.
*/
- current->flags |= PF_MEMALLOC;
+ noreclaim_flag = memalloc_noreclaim_save();
ret = __netif_receive_skb_core(skb, true);
- tsk_restore_flags(current, pflags, PF_MEMALLOC);
+ memalloc_noreclaim_restore(noreclaim_flag);
} else
ret = __netif_receive_skb_core(skb, false);
diff --git a/net/core/sock.c b/net/core/sock.c
index 392f9b6f96e2..0b2d06b4c308 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -102,6 +102,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
@@ -372,14 +373,14 @@ EXPORT_SYMBOL_GPL(sk_clear_memalloc);
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
int ret;
- unsigned long pflags = current->flags;
+ unsigned int noreclaim_flag;
/* these should have been dropped before queueing */
BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
- current->flags |= PF_MEMALLOC;
+ noreclaim_flag = memalloc_noreclaim_save();
ret = sk->sk_backlog_rcv(sk, skb);
- tsk_restore_flags(current, pflags, PF_MEMALLOC);
+ memalloc_noreclaim_restore(noreclaim_flag);
return ret;
}
--
2.12.2
The previous patch has shown that simply setting and clearing PF_MEMALLOC in
current->flags can result in wrongly clearing a pre-existing PF_MEMALLOC flag
and potentially lead to recursive reclaim. Let's introduce helpers that support
proper nesting by saving the previous stat of the flag, similar to the existing
memalloc_noio_* and memalloc_nofs_* helpers. Convert existing setting/clearing
of PF_MEMALLOC within mm to the new helpers.
There are no known issues with the converted code, but the change makes it more
robust.
Suggested-by: Michal Hocko <[email protected]>
Signed-off-by: Vlastimil Babka <[email protected]>
---
include/linux/sched/mm.h | 12 ++++++++++++
mm/page_alloc.c | 11 ++++++-----
mm/vmscan.c | 17 +++++++++++------
3 files changed, 29 insertions(+), 11 deletions(-)
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 9daabe138c99..2b24a6974847 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -191,4 +191,16 @@ static inline void memalloc_nofs_restore(unsigned int flags)
current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags;
}
+static inline unsigned int memalloc_noreclaim_save(void)
+{
+ unsigned int flags = current->flags & PF_MEMALLOC;
+ current->flags |= PF_MEMALLOC;
+ return flags;
+}
+
+static inline void memalloc_noreclaim_restore(unsigned int flags)
+{
+ current->flags = (current->flags & ~PF_MEMALLOC) | flags;
+}
+
#endif /* _LINUX_SCHED_MM_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b84e6ffbe756..037e32dccd7a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3288,15 +3288,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
enum compact_priority prio, enum compact_result *compact_result)
{
struct page *page;
- unsigned int noreclaim_flag = current->flags & PF_MEMALLOC;
+ unsigned int noreclaim_flag;
if (!order)
return NULL;
- current->flags |= PF_MEMALLOC;
+ noreclaim_flag = memalloc_noreclaim_save();
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
prio);
- current->flags = (current->flags & ~PF_MEMALLOC) | noreclaim_flag;
+ memalloc_noreclaim_restore(noreclaim_flag);
if (*compact_result <= COMPACT_INACTIVE)
return NULL;
@@ -3443,12 +3443,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
{
struct reclaim_state reclaim_state;
int progress;
+ unsigned int noreclaim_flag;
cond_resched();
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
- current->flags |= PF_MEMALLOC;
+ noreclaim_flag = memalloc_noreclaim_save();
lockdep_set_current_reclaim_state(gfp_mask);
reclaim_state.reclaimed_slab = 0;
current->reclaim_state = &reclaim_state;
@@ -3458,7 +3459,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
current->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
- current->flags &= ~PF_MEMALLOC;
+ memalloc_noreclaim_restore(noreclaim_flag);
cond_resched();
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 58615bb27f2f..ff63b91a0f48 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2992,6 +2992,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
struct zonelist *zonelist;
unsigned long nr_reclaimed;
int nid;
+ unsigned int noreclaim_flag;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
.gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
@@ -3018,9 +3019,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
sc.gfp_mask,
sc.reclaim_idx);
- current->flags |= PF_MEMALLOC;
+ noreclaim_flag = memalloc_noreclaim_save();
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
- current->flags &= ~PF_MEMALLOC;
+ memalloc_noreclaim_restore(noreclaim_flag);
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
@@ -3544,8 +3545,9 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
struct task_struct *p = current;
unsigned long nr_reclaimed;
+ unsigned int noreclaim_flag;
- p->flags |= PF_MEMALLOC;
+ noreclaim_flag = memalloc_noreclaim_save();
lockdep_set_current_reclaim_state(sc.gfp_mask);
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
@@ -3554,7 +3556,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
p->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
- p->flags &= ~PF_MEMALLOC;
+ memalloc_noreclaim_restore(noreclaim_flag);
return nr_reclaimed;
}
@@ -3719,6 +3721,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
struct task_struct *p = current;
struct reclaim_state reclaim_state;
int classzone_idx = gfp_zone(gfp_mask);
+ unsigned int noreclaim_flag;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
.gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
@@ -3736,7 +3739,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
* and we also need to be able to write out pages for RECLAIM_WRITE
* and RECLAIM_UNMAP.
*/
- p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
+ noreclaim_flag = memalloc_noreclaim_save();
+ p->flags |= PF_SWAPWRITE;
lockdep_set_current_reclaim_state(gfp_mask);
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
@@ -3752,7 +3756,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
}
p->reclaim_state = NULL;
- current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
+ current->flags &= ~PF_SWAPWRITE;
+ memalloc_noreclaim_restore(noreclaim_flag);
lockdep_clear_current_reclaim_state();
return sc.nr_reclaimed >= nr_pages;
}
--
2.12.2
The function __alloc_pages_direct_compact() sets PF_MEMALLOC to prevent
deadlock during page migration by lock_page() (see the comment in
__unmap_and_move()). Then it unconditionally clears the flag, which can clear a
pre-existing PF_MEMALLOC flag and result in recursive reclaim. This was not a
problem until commit a8161d1ed609 ("mm, page_alloc: restructure direct
compaction handling in slowpath"), because direct compation was called only
after direct reclaim, which was skipped when PF_MEMALLOC flag was set.
Even now it's only a theoretical issue, as the new callsite of
__alloc_pages_direct_compact() is reached only for costly orders and when
gfp_pfmemalloc_allowed() is true, which means either __GFP_NOMEMALLOC is in
gfp_flags or in_interrupt() is true. There is no such known context, but let's
play it safe and make __alloc_pages_direct_compact() robust for cases where
PF_MEMALLOC is already set.
Fixes: a8161d1ed609 ("mm, page_alloc: restructure direct compaction handling in slowpath")
Reported-by: Andrey Ryabinin <[email protected]>
Signed-off-by: Vlastimil Babka <[email protected]>
Cc: <[email protected]>
---
mm/page_alloc.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3589f8be53be..b84e6ffbe756 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3288,6 +3288,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
enum compact_priority prio, enum compact_result *compact_result)
{
struct page *page;
+ unsigned int noreclaim_flag = current->flags & PF_MEMALLOC;
if (!order)
return NULL;
@@ -3295,7 +3296,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
current->flags |= PF_MEMALLOC;
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
prio);
- current->flags &= ~PF_MEMALLOC;
+ current->flags = (current->flags & ~PF_MEMALLOC) | noreclaim_flag;
if (*compact_result <= COMPACT_INACTIVE)
return NULL;
--
2.12.2
Nandsim has own functions set_memalloc() and clear_memalloc() for robust
setting and clearing of PF_MEMALLOC. Replace them by the new generic helpers.
No functional change.
Signed-off-by: Vlastimil Babka <[email protected]>
Cc: Boris Brezillon <[email protected]>
Cc: Richard Weinberger <[email protected]>
---
drivers/mtd/nand/nandsim.c | 29 +++++++++--------------------
1 file changed, 9 insertions(+), 20 deletions(-)
diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c
index cef818f535ed..03a0d057bf2f 100644
--- a/drivers/mtd/nand/nandsim.c
+++ b/drivers/mtd/nand/nandsim.c
@@ -40,6 +40,7 @@
#include <linux/list.h>
#include <linux/random.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/seq_file.h>
@@ -1368,31 +1369,18 @@ static int get_pages(struct nandsim *ns, struct file *file, size_t count, loff_t
return 0;
}
-static int set_memalloc(void)
-{
- if (current->flags & PF_MEMALLOC)
- return 0;
- current->flags |= PF_MEMALLOC;
- return 1;
-}
-
-static void clear_memalloc(int memalloc)
-{
- if (memalloc)
- current->flags &= ~PF_MEMALLOC;
-}
-
static ssize_t read_file(struct nandsim *ns, struct file *file, void *buf, size_t count, loff_t pos)
{
ssize_t tx;
- int err, memalloc;
+ int err;
+ unsigned int noreclaim_flag;
err = get_pages(ns, file, count, pos);
if (err)
return err;
- memalloc = set_memalloc();
+ noreclaim_flag = memalloc_noreclaim_save();
tx = kernel_read(file, pos, buf, count);
- clear_memalloc(memalloc);
+ memalloc_noreclaim_restore(noreclaim_flag);
put_pages(ns);
return tx;
}
@@ -1400,14 +1388,15 @@ static ssize_t read_file(struct nandsim *ns, struct file *file, void *buf, size_
static ssize_t write_file(struct nandsim *ns, struct file *file, void *buf, size_t count, loff_t pos)
{
ssize_t tx;
- int err, memalloc;
+ int err;
+ unsigned int noreclaim_flag;
err = get_pages(ns, file, count, pos);
if (err)
return err;
- memalloc = set_memalloc();
+ noreclaim_flag = memalloc_noreclaim_save();
tx = kernel_write(file, buf, count, pos);
- clear_memalloc(memalloc);
+ memalloc_noreclaim_restore(noreclaim_flag);
put_pages(ns);
return tx;
}
--
2.12.2
On Wed 05-04-17 09:46:57, Vlastimil Babka wrote:
> The function __alloc_pages_direct_compact() sets PF_MEMALLOC to prevent
> deadlock during page migration by lock_page() (see the comment in
> __unmap_and_move()). Then it unconditionally clears the flag, which can clear a
> pre-existing PF_MEMALLOC flag and result in recursive reclaim. This was not a
> problem until commit a8161d1ed609 ("mm, page_alloc: restructure direct
> compaction handling in slowpath"), because direct compation was called only
> after direct reclaim, which was skipped when PF_MEMALLOC flag was set.
>
> Even now it's only a theoretical issue, as the new callsite of
> __alloc_pages_direct_compact() is reached only for costly orders and when
> gfp_pfmemalloc_allowed() is true, which means either __GFP_NOMEMALLOC is in
> gfp_flags or in_interrupt() is true. There is no such known context, but let's
> play it safe and make __alloc_pages_direct_compact() robust for cases where
> PF_MEMALLOC is already set.
>
> Fixes: a8161d1ed609 ("mm, page_alloc: restructure direct compaction handling in slowpath")
> Reported-by: Andrey Ryabinin <[email protected]>
> Signed-off-by: Vlastimil Babka <[email protected]>
> Cc: <[email protected]>
Acked-by: Michal Hocko <[email protected]>
> ---
> mm/page_alloc.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 3589f8be53be..b84e6ffbe756 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3288,6 +3288,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
> enum compact_priority prio, enum compact_result *compact_result)
> {
> struct page *page;
> + unsigned int noreclaim_flag = current->flags & PF_MEMALLOC;
>
> if (!order)
> return NULL;
> @@ -3295,7 +3296,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
> current->flags |= PF_MEMALLOC;
> *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
> prio);
> - current->flags &= ~PF_MEMALLOC;
> + current->flags = (current->flags & ~PF_MEMALLOC) | noreclaim_flag;
>
> if (*compact_result <= COMPACT_INACTIVE)
> return NULL;
> --
> 2.12.2
--
Michal Hocko
SUSE Labs
On Wed 05-04-17 09:46:59, Vlastimil Babka wrote:
> We now have memalloc_noreclaim_{save,restore} helpers for robust setting and
> clearing of PF_MEMALLOC. Let's convert the code which was using the generic
> tsk_restore_flags(). No functional change.
It would be really great to revisit why those places outside of the mm
proper really need this flag. I know this is a painful exercise but I
wouldn't be surprised if there were abusers there.
> Signed-off-by: Vlastimil Babka <[email protected]>
> Cc: Josef Bacik <[email protected]>
> Cc: Lee Duncan <[email protected]>
> Cc: Chris Leech <[email protected]>
> Cc: "David S. Miller" <[email protected]>
> Cc: Eric Dumazet <[email protected]>
Acked-by: Michal Hocko <[email protected]>
> ---
> drivers/block/nbd.c | 7 ++++---
> drivers/scsi/iscsi_tcp.c | 7 ++++---
> net/core/dev.c | 7 ++++---
> net/core/sock.c | 7 ++++---
> 4 files changed, 16 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
> index 03ae72985c79..929fc548c7fb 100644
> --- a/drivers/block/nbd.c
> +++ b/drivers/block/nbd.c
> @@ -18,6 +18,7 @@
> #include <linux/module.h>
> #include <linux/init.h>
> #include <linux/sched.h>
> +#include <linux/sched/mm.h>
> #include <linux/fs.h>
> #include <linux/bio.h>
> #include <linux/stat.h>
> @@ -210,7 +211,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
> struct socket *sock = nbd->socks[index]->sock;
> int result;
> struct msghdr msg;
> - unsigned long pflags = current->flags;
> + unsigned int noreclaim_flag;
>
> if (unlikely(!sock)) {
> dev_err_ratelimited(disk_to_dev(nbd->disk),
> @@ -221,7 +222,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
>
> msg.msg_iter = *iter;
>
> - current->flags |= PF_MEMALLOC;
> + noreclaim_flag = memalloc_noreclaim_save();
> do {
> sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
> msg.msg_name = NULL;
> @@ -244,7 +245,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
> *sent += result;
> } while (msg_data_left(&msg));
>
> - tsk_restore_flags(current, pflags, PF_MEMALLOC);
> + memalloc_noreclaim_restore(noreclaim_flag);
>
> return result;
> }
> diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
> index 4228aba1f654..4842fc0e809d 100644
> --- a/drivers/scsi/iscsi_tcp.c
> +++ b/drivers/scsi/iscsi_tcp.c
> @@ -30,6 +30,7 @@
> #include <linux/types.h>
> #include <linux/inet.h>
> #include <linux/slab.h>
> +#include <linux/sched/mm.h>
> #include <linux/file.h>
> #include <linux/blkdev.h>
> #include <linux/delay.h>
> @@ -371,10 +372,10 @@ static inline int iscsi_sw_tcp_xmit_qlen(struct iscsi_conn *conn)
> static int iscsi_sw_tcp_pdu_xmit(struct iscsi_task *task)
> {
> struct iscsi_conn *conn = task->conn;
> - unsigned long pflags = current->flags;
> + unsigned int noreclaim_flag;
> int rc = 0;
>
> - current->flags |= PF_MEMALLOC;
> + noreclaim_flag = memalloc_noreclaim_save();
>
> while (iscsi_sw_tcp_xmit_qlen(conn)) {
> rc = iscsi_sw_tcp_xmit(conn);
> @@ -387,7 +388,7 @@ static int iscsi_sw_tcp_pdu_xmit(struct iscsi_task *task)
> rc = 0;
> }
>
> - tsk_restore_flags(current, pflags, PF_MEMALLOC);
> + memalloc_noreclaim_restore(noreclaim_flag);
> return rc;
> }
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index fde8b3f7136b..e0705a126b24 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -81,6 +81,7 @@
> #include <linux/hash.h>
> #include <linux/slab.h>
> #include <linux/sched.h>
> +#include <linux/sched/mm.h>
> #include <linux/mutex.h>
> #include <linux/string.h>
> #include <linux/mm.h>
> @@ -4227,7 +4228,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
> int ret;
>
> if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
> - unsigned long pflags = current->flags;
> + unsigned int noreclaim_flag;
>
> /*
> * PFMEMALLOC skbs are special, they should
> @@ -4238,9 +4239,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
> * Use PF_MEMALLOC as this saves us from propagating the allocation
> * context down to all allocation sites.
> */
> - current->flags |= PF_MEMALLOC;
> + noreclaim_flag = memalloc_noreclaim_save();
> ret = __netif_receive_skb_core(skb, true);
> - tsk_restore_flags(current, pflags, PF_MEMALLOC);
> + memalloc_noreclaim_restore(noreclaim_flag);
> } else
> ret = __netif_receive_skb_core(skb, false);
>
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 392f9b6f96e2..0b2d06b4c308 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -102,6 +102,7 @@
> #include <linux/proc_fs.h>
> #include <linux/seq_file.h>
> #include <linux/sched.h>
> +#include <linux/sched/mm.h>
> #include <linux/timer.h>
> #include <linux/string.h>
> #include <linux/sockios.h>
> @@ -372,14 +373,14 @@ EXPORT_SYMBOL_GPL(sk_clear_memalloc);
> int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
> {
> int ret;
> - unsigned long pflags = current->flags;
> + unsigned int noreclaim_flag;
>
> /* these should have been dropped before queueing */
> BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
>
> - current->flags |= PF_MEMALLOC;
> + noreclaim_flag = memalloc_noreclaim_save();
> ret = sk->sk_backlog_rcv(sk, skb);
> - tsk_restore_flags(current, pflags, PF_MEMALLOC);
> + memalloc_noreclaim_restore(noreclaim_flag);
>
> return ret;
> }
> --
> 2.12.2
--
Michal Hocko
SUSE Labs
On Wed 05-04-17 09:46:58, Vlastimil Babka wrote:
> The previous patch has shown that simply setting and clearing PF_MEMALLOC in
> current->flags can result in wrongly clearing a pre-existing PF_MEMALLOC flag
> and potentially lead to recursive reclaim. Let's introduce helpers that support
> proper nesting by saving the previous stat of the flag, similar to the existing
> memalloc_noio_* and memalloc_nofs_* helpers. Convert existing setting/clearing
> of PF_MEMALLOC within mm to the new helpers.
>
> There are no known issues with the converted code, but the change makes it more
> robust.
>
> Suggested-by: Michal Hocko <[email protected]>
> Signed-off-by: Vlastimil Babka <[email protected]>
One could argue that tsk_restore_flags() could be extended to provide
tsk_set_flags() and use it for all allocation related PF flags. I do not
have a strong opinion on that but explicit API sounds a bit better to me
because is easier to follow (at least for me). If others think that
generic API would be better then I won't have any objections. Anyway
this looks good to me.
Acked-by: Michal Hocko <[email protected]>
> ---
> include/linux/sched/mm.h | 12 ++++++++++++
> mm/page_alloc.c | 11 ++++++-----
> mm/vmscan.c | 17 +++++++++++------
> 3 files changed, 29 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> index 9daabe138c99..2b24a6974847 100644
> --- a/include/linux/sched/mm.h
> +++ b/include/linux/sched/mm.h
> @@ -191,4 +191,16 @@ static inline void memalloc_nofs_restore(unsigned int flags)
> current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags;
> }
>
> +static inline unsigned int memalloc_noreclaim_save(void)
> +{
> + unsigned int flags = current->flags & PF_MEMALLOC;
> + current->flags |= PF_MEMALLOC;
> + return flags;
> +}
> +
> +static inline void memalloc_noreclaim_restore(unsigned int flags)
> +{
> + current->flags = (current->flags & ~PF_MEMALLOC) | flags;
> +}
> +
> #endif /* _LINUX_SCHED_MM_H */
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index b84e6ffbe756..037e32dccd7a 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3288,15 +3288,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
> enum compact_priority prio, enum compact_result *compact_result)
> {
> struct page *page;
> - unsigned int noreclaim_flag = current->flags & PF_MEMALLOC;
> + unsigned int noreclaim_flag;
>
> if (!order)
> return NULL;
>
> - current->flags |= PF_MEMALLOC;
> + noreclaim_flag = memalloc_noreclaim_save();
> *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
> prio);
> - current->flags = (current->flags & ~PF_MEMALLOC) | noreclaim_flag;
> + memalloc_noreclaim_restore(noreclaim_flag);
>
> if (*compact_result <= COMPACT_INACTIVE)
> return NULL;
> @@ -3443,12 +3443,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
> {
> struct reclaim_state reclaim_state;
> int progress;
> + unsigned int noreclaim_flag;
>
> cond_resched();
>
> /* We now go into synchronous reclaim */
> cpuset_memory_pressure_bump();
> - current->flags |= PF_MEMALLOC;
> + noreclaim_flag = memalloc_noreclaim_save();
> lockdep_set_current_reclaim_state(gfp_mask);
> reclaim_state.reclaimed_slab = 0;
> current->reclaim_state = &reclaim_state;
> @@ -3458,7 +3459,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
>
> current->reclaim_state = NULL;
> lockdep_clear_current_reclaim_state();
> - current->flags &= ~PF_MEMALLOC;
> + memalloc_noreclaim_restore(noreclaim_flag);
>
> cond_resched();
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 58615bb27f2f..ff63b91a0f48 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2992,6 +2992,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> struct zonelist *zonelist;
> unsigned long nr_reclaimed;
> int nid;
> + unsigned int noreclaim_flag;
> struct scan_control sc = {
> .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
> .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
> @@ -3018,9 +3019,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> sc.gfp_mask,
> sc.reclaim_idx);
>
> - current->flags |= PF_MEMALLOC;
> + noreclaim_flag = memalloc_noreclaim_save();
> nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
> - current->flags &= ~PF_MEMALLOC;
> + memalloc_noreclaim_restore(noreclaim_flag);
>
> trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
>
> @@ -3544,8 +3545,9 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
> struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
> struct task_struct *p = current;
> unsigned long nr_reclaimed;
> + unsigned int noreclaim_flag;
>
> - p->flags |= PF_MEMALLOC;
> + noreclaim_flag = memalloc_noreclaim_save();
> lockdep_set_current_reclaim_state(sc.gfp_mask);
> reclaim_state.reclaimed_slab = 0;
> p->reclaim_state = &reclaim_state;
> @@ -3554,7 +3556,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
>
> p->reclaim_state = NULL;
> lockdep_clear_current_reclaim_state();
> - p->flags &= ~PF_MEMALLOC;
> + memalloc_noreclaim_restore(noreclaim_flag);
>
> return nr_reclaimed;
> }
> @@ -3719,6 +3721,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
> struct task_struct *p = current;
> struct reclaim_state reclaim_state;
> int classzone_idx = gfp_zone(gfp_mask);
> + unsigned int noreclaim_flag;
> struct scan_control sc = {
> .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
> .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
> @@ -3736,7 +3739,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
> * and we also need to be able to write out pages for RECLAIM_WRITE
> * and RECLAIM_UNMAP.
> */
> - p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
> + noreclaim_flag = memalloc_noreclaim_save();
> + p->flags |= PF_SWAPWRITE;
> lockdep_set_current_reclaim_state(gfp_mask);
> reclaim_state.reclaimed_slab = 0;
> p->reclaim_state = &reclaim_state;
> @@ -3752,7 +3756,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
> }
>
> p->reclaim_state = NULL;
> - current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
> + current->flags &= ~PF_SWAPWRITE;
> + memalloc_noreclaim_restore(noreclaim_flag);
> lockdep_clear_current_reclaim_state();
> return sc.nr_reclaimed >= nr_pages;
> }
> --
> 2.12.2
--
Michal Hocko
SUSE Labs
On Wed 05-04-17 09:47:00, Vlastimil Babka wrote:
> Nandsim has own functions set_memalloc() and clear_memalloc() for robust
> setting and clearing of PF_MEMALLOC. Replace them by the new generic helpers.
> No functional change.
This one smells like an abuser. Why the hell should read/write path
touch memory reserves at all!
>
> Signed-off-by: Vlastimil Babka <[email protected]>
> Cc: Boris Brezillon <[email protected]>
> Cc: Richard Weinberger <[email protected]>
> ---
> drivers/mtd/nand/nandsim.c | 29 +++++++++--------------------
> 1 file changed, 9 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c
> index cef818f535ed..03a0d057bf2f 100644
> --- a/drivers/mtd/nand/nandsim.c
> +++ b/drivers/mtd/nand/nandsim.c
> @@ -40,6 +40,7 @@
> #include <linux/list.h>
> #include <linux/random.h>
> #include <linux/sched.h>
> +#include <linux/sched/mm.h>
> #include <linux/fs.h>
> #include <linux/pagemap.h>
> #include <linux/seq_file.h>
> @@ -1368,31 +1369,18 @@ static int get_pages(struct nandsim *ns, struct file *file, size_t count, loff_t
> return 0;
> }
>
> -static int set_memalloc(void)
> -{
> - if (current->flags & PF_MEMALLOC)
> - return 0;
> - current->flags |= PF_MEMALLOC;
> - return 1;
> -}
> -
> -static void clear_memalloc(int memalloc)
> -{
> - if (memalloc)
> - current->flags &= ~PF_MEMALLOC;
> -}
> -
> static ssize_t read_file(struct nandsim *ns, struct file *file, void *buf, size_t count, loff_t pos)
> {
> ssize_t tx;
> - int err, memalloc;
> + int err;
> + unsigned int noreclaim_flag;
>
> err = get_pages(ns, file, count, pos);
> if (err)
> return err;
> - memalloc = set_memalloc();
> + noreclaim_flag = memalloc_noreclaim_save();
> tx = kernel_read(file, pos, buf, count);
> - clear_memalloc(memalloc);
> + memalloc_noreclaim_restore(noreclaim_flag);
> put_pages(ns);
> return tx;
> }
> @@ -1400,14 +1388,15 @@ static ssize_t read_file(struct nandsim *ns, struct file *file, void *buf, size_
> static ssize_t write_file(struct nandsim *ns, struct file *file, void *buf, size_t count, loff_t pos)
> {
> ssize_t tx;
> - int err, memalloc;
> + int err;
> + unsigned int noreclaim_flag;
>
> err = get_pages(ns, file, count, pos);
> if (err)
> return err;
> - memalloc = set_memalloc();
> + noreclaim_flag = memalloc_noreclaim_save();
> tx = kernel_write(file, buf, count, pos);
> - clear_memalloc(memalloc);
> + memalloc_noreclaim_restore(noreclaim_flag);
> put_pages(ns);
> return tx;
> }
> --
> 2.12.2
--
Michal Hocko
SUSE Labs
Michal,
Am 05.04.2017 um 13:31 schrieb Michal Hocko:
> On Wed 05-04-17 09:47:00, Vlastimil Babka wrote:
>> Nandsim has own functions set_memalloc() and clear_memalloc() for robust
>> setting and clearing of PF_MEMALLOC. Replace them by the new generic helpers.
>> No functional change.
>
> This one smells like an abuser. Why the hell should read/write path
> touch memory reserves at all!
Could be. Let's ask Adrian, AFAIK he wrote that code.
Adrian, can you please clarify why nandsim needs to play with PF_MEMALLOC?
Thanks,
//richard
On 04/05/2017 01:36 PM, Richard Weinberger wrote:
> Michal,
>
> Am 05.04.2017 um 13:31 schrieb Michal Hocko:
>> On Wed 05-04-17 09:47:00, Vlastimil Babka wrote:
>>> Nandsim has own functions set_memalloc() and clear_memalloc() for robust
>>> setting and clearing of PF_MEMALLOC. Replace them by the new generic helpers.
>>> No functional change.
>>
>> This one smells like an abuser. Why the hell should read/write path
>> touch memory reserves at all!
>
> Could be. Let's ask Adrian, AFAIK he wrote that code.
> Adrian, can you please clarify why nandsim needs to play with PF_MEMALLOC?
I was thinking about it and concluded that since the simulator can be
used as a block device where reclaimed pages go to, writing the data out
is a memalloc operation. Then reading can be called as part of r-m-w
cycle, so reading as well. But it would be great if somebody more
knowledgeable confirmed this.
> Thanks,
> //richard
>
On 04/05/2017 10:46 AM, Vlastimil Babka wrote:
> The function __alloc_pages_direct_compact() sets PF_MEMALLOC to prevent
> deadlock during page migration by lock_page() (see the comment in
> __unmap_and_move()). Then it unconditionally clears the flag, which can clear a
> pre-existing PF_MEMALLOC flag and result in recursive reclaim. This was not a
> problem until commit a8161d1ed609 ("mm, page_alloc: restructure direct
> compaction handling in slowpath"), because direct compation was called only
> after direct reclaim, which was skipped when PF_MEMALLOC flag was set.
>
> Even now it's only a theoretical issue, as the new callsite of
> __alloc_pages_direct_compact() is reached only for costly orders and when
> gfp_pfmemalloc_allowed() is true, which means either __GFP_NOMEMALLOC is in
is false
> gfp_flags or in_interrupt() is true. There is no such known context, but let's
> play it safe and make __alloc_pages_direct_compact() robust for cases where
> PF_MEMALLOC is already set.
>
> Fixes: a8161d1ed609 ("mm, page_alloc: restructure direct compaction handling in slowpath")
> Reported-by: Andrey Ryabinin <[email protected]>
> Signed-off-by: Vlastimil Babka <[email protected]>
> Cc: <[email protected]>
> ---
> mm/page_alloc.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 3589f8be53be..b84e6ffbe756 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3288,6 +3288,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
> enum compact_priority prio, enum compact_result *compact_result)
> {
> struct page *page;
> + unsigned int noreclaim_flag = current->flags & PF_MEMALLOC;
>
> if (!order)
> return NULL;
> @@ -3295,7 +3296,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
> current->flags |= PF_MEMALLOC;
> *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
> prio);
> - current->flags &= ~PF_MEMALLOC;
> + current->flags = (current->flags & ~PF_MEMALLOC) | noreclaim_flag;
Perhaps this would look better:
tsk_restore_flags(current, noreclaim_flag, PF_MEMALLOC);
?
> if (*compact_result <= COMPACT_INACTIVE)
> return NULL;
>
On Wed 05-04-17 13:39:16, Vlastimil Babka wrote:
> On 04/05/2017 01:36 PM, Richard Weinberger wrote:
> > Michal,
> >
> > Am 05.04.2017 um 13:31 schrieb Michal Hocko:
> >> On Wed 05-04-17 09:47:00, Vlastimil Babka wrote:
> >>> Nandsim has own functions set_memalloc() and clear_memalloc() for robust
> >>> setting and clearing of PF_MEMALLOC. Replace them by the new generic helpers.
> >>> No functional change.
> >>
> >> This one smells like an abuser. Why the hell should read/write path
> >> touch memory reserves at all!
> >
> > Could be. Let's ask Adrian, AFAIK he wrote that code.
> > Adrian, can you please clarify why nandsim needs to play with PF_MEMALLOC?
>
> I was thinking about it and concluded that since the simulator can be
> used as a block device where reclaimed pages go to, writing the data out
> is a memalloc operation. Then reading can be called as part of r-m-w
> cycle, so reading as well. But it would be great if somebody more
> knowledgeable confirmed this.
then this deserves a big fat comment explaining all the details,
including how the complete depletion of reserves is prevented.
--
Michal Hocko
SUSE Labs
On 05/04/17 14:39, Vlastimil Babka wrote:
> On 04/05/2017 01:36 PM, Richard Weinberger wrote:
>> Michal,
>>
>> Am 05.04.2017 um 13:31 schrieb Michal Hocko:
>>> On Wed 05-04-17 09:47:00, Vlastimil Babka wrote:
>>>> Nandsim has own functions set_memalloc() and clear_memalloc() for robust
>>>> setting and clearing of PF_MEMALLOC. Replace them by the new generic helpers.
>>>> No functional change.
>>>
>>> This one smells like an abuser. Why the hell should read/write path
>>> touch memory reserves at all!
>>
>> Could be. Let's ask Adrian, AFAIK he wrote that code.
>> Adrian, can you please clarify why nandsim needs to play with PF_MEMALLOC?
>
> I was thinking about it and concluded that since the simulator can be
> used as a block device where reclaimed pages go to, writing the data out
> is a memalloc operation. Then reading can be called as part of r-m-w
> cycle, so reading as well.
IIRC it was to avoid getting stuck with nandsim waiting on memory reclaim
and memory reclaim waiting on nandsim.
On Wed, Apr 05, 2017 at 01:30:31PM +0200, Michal Hocko wrote:
> On Wed 05-04-17 09:46:59, Vlastimil Babka wrote:
> > We now have memalloc_noreclaim_{save,restore} helpers for robust setting and
> > clearing of PF_MEMALLOC. Let's convert the code which was using the generic
> > tsk_restore_flags(). No functional change.
>
> It would be really great to revisit why those places outside of the mm
> proper really need this flag. I know this is a painful exercise but I
> wouldn't be surprised if there were abusers there.
[...]
> > ---
> > drivers/block/nbd.c | 7 ++++---
> > drivers/scsi/iscsi_tcp.c | 7 ++++---
> > net/core/dev.c | 7 ++++---
> > net/core/sock.c | 7 ++++---
> > 4 files changed, 16 insertions(+), 12 deletions(-)
These were all done to make swapping over network safe. The idea is that
if a socket has SOCK_MEMALLOC set, incoming packets for that socket can
access PFMEMALLOC reserves (whereas other sockets cannot); this all in
the hope that one packe destined to that socket will contain the TCP ACK
that confirms the swapout was successful and we can now release RAM
pages for other processes.
I don't know whether they need the PF_MEMALLOC flag specifically (not a
kernel hacker), but they do need to interact with it at any rate.
--
< ron> I mean, the main *practical* problem with C++, is there's like a dozen
people in the world who think they really understand all of its rules,
and pretty much all of them are just lying to themselves too.
-- #debian-devel, OFTC, 2016-02-12
On Thu 06-04-17 09:33:44, Adrian Hunter wrote:
> On 05/04/17 14:39, Vlastimil Babka wrote:
> > On 04/05/2017 01:36 PM, Richard Weinberger wrote:
> >> Michal,
> >>
> >> Am 05.04.2017 um 13:31 schrieb Michal Hocko:
> >>> On Wed 05-04-17 09:47:00, Vlastimil Babka wrote:
> >>>> Nandsim has own functions set_memalloc() and clear_memalloc() for robust
> >>>> setting and clearing of PF_MEMALLOC. Replace them by the new generic helpers.
> >>>> No functional change.
> >>>
> >>> This one smells like an abuser. Why the hell should read/write path
> >>> touch memory reserves at all!
> >>
> >> Could be. Let's ask Adrian, AFAIK he wrote that code.
> >> Adrian, can you please clarify why nandsim needs to play with PF_MEMALLOC?
> >
> > I was thinking about it and concluded that since the simulator can be
> > used as a block device where reclaimed pages go to, writing the data out
> > is a memalloc operation. Then reading can be called as part of r-m-w
> > cycle, so reading as well.
>
> IIRC it was to avoid getting stuck with nandsim waiting on memory reclaim
> and memory reclaim waiting on nandsim.
I've got lost in the indirection. Could you describe how would reclaim
get stuck waiting on these paths please?
--
Michal Hocko
SUSE Labs
On Thu, Apr 06, 2017 at 08:38:10AM +0200, Wouter Verhelst wrote:
> On Wed, Apr 05, 2017 at 01:30:31PM +0200, Michal Hocko wrote:
> > On Wed 05-04-17 09:46:59, Vlastimil Babka wrote:
> > > We now have memalloc_noreclaim_{save,restore} helpers for robust setting and
> > > clearing of PF_MEMALLOC. Let's convert the code which was using the generic
> > > tsk_restore_flags(). No functional change.
> >
> > It would be really great to revisit why those places outside of the mm
> > proper really need this flag. I know this is a painful exercise but I
> > wouldn't be surprised if there were abusers there.
> [...]
> > > ---
> > > drivers/block/nbd.c | 7 ++++---
> > > drivers/scsi/iscsi_tcp.c | 7 ++++---
> > > net/core/dev.c | 7 ++++---
> > > net/core/sock.c | 7 ++++---
> > > 4 files changed, 16 insertions(+), 12 deletions(-)
>
> These were all done to make swapping over network safe. The idea is that
> if a socket has SOCK_MEMALLOC set, incoming packets for that socket can
> access PFMEMALLOC reserves (whereas other sockets cannot); this all in
> the hope that one packe destined to that socket will contain the TCP ACK
> that confirms the swapout was successful and we can now release RAM
> pages for other processes.
>
> I don't know whether they need the PF_MEMALLOC flag specifically (not a
> kernel hacker), but they do need to interact with it at any rate.
>
At the time it was required to get access to emergency reserves so swapping
can continue. The flip side is that the memory is then protected so pages
allocated from emergency reserves are not used for network traffic that
is not involved with swap. This means that under heavy swap load, it was
perfectly possible for unrelated traffic to get dropped for quite some
time.
--
Mel Gorman
SUSE Labs
On April 05, 2017 3:47 PM Vlastimil Babka wrote:
>
> The function __alloc_pages_direct_compact() sets PF_MEMALLOC to prevent
> deadlock during page migration by lock_page() (see the comment in
> __unmap_and_move()). Then it unconditionally clears the flag, which can clear a
> pre-existing PF_MEMALLOC flag and result in recursive reclaim. This was not a
> problem until commit a8161d1ed609 ("mm, page_alloc: restructure direct
> compaction handling in slowpath"), because direct compation was called only
> after direct reclaim, which was skipped when PF_MEMALLOC flag was set.
>
> Even now it's only a theoretical issue, as the new callsite of
> __alloc_pages_direct_compact() is reached only for costly orders and when
> gfp_pfmemalloc_allowed() is true, which means either __GFP_NOMEMALLOC is in
> gfp_flags or in_interrupt() is true. There is no such known context, but let's
> play it safe and make __alloc_pages_direct_compact() robust for cases where
> PF_MEMALLOC is already set.
>
> Fixes: a8161d1ed609 ("mm, page_alloc: restructure direct compaction handling in slowpath")
> Reported-by: Andrey Ryabinin <[email protected]>
> Signed-off-by: Vlastimil Babka <[email protected]>
> Cc: <[email protected]>
> ---
Acked-by: Hillf Danton <[email protected]>
> mm/page_alloc.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 3589f8be53be..b84e6ffbe756 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3288,6 +3288,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
> enum compact_priority prio, enum compact_result *compact_result)
> {
> struct page *page;
> + unsigned int noreclaim_flag = current->flags & PF_MEMALLOC;
>
> if (!order)
> return NULL;
> @@ -3295,7 +3296,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
> current->flags |= PF_MEMALLOC;
> *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
> prio);
> - current->flags &= ~PF_MEMALLOC;
> + current->flags = (current->flags & ~PF_MEMALLOC) | noreclaim_flag;
>
> if (*compact_result <= COMPACT_INACTIVE)
> return NULL;
> --
> 2.12.2
On April 05, 2017 3:47 PM Vlastimil Babka wrote:
>
> The previous patch has shown that simply setting and clearing PF_MEMALLOC in
> current->flags can result in wrongly clearing a pre-existing PF_MEMALLOC flag
> and potentially lead to recursive reclaim. Let's introduce helpers that support
> proper nesting by saving the previous stat of the flag, similar to the existing
> memalloc_noio_* and memalloc_nofs_* helpers. Convert existing setting/clearing
> of PF_MEMALLOC within mm to the new helpers.
>
> There are no known issues with the converted code, but the change makes it more
> robust.
>
> Suggested-by: Michal Hocko <[email protected]>
> Signed-off-by: Vlastimil Babka <[email protected]>
> ---
Acked-by: Hillf Danton <[email protected]>
On 04/05/2017 01:40 PM, Andrey Ryabinin wrote:
> On 04/05/2017 10:46 AM, Vlastimil Babka wrote:
>> The function __alloc_pages_direct_compact() sets PF_MEMALLOC to prevent
>> deadlock during page migration by lock_page() (see the comment in
>> __unmap_and_move()). Then it unconditionally clears the flag, which can clear a
>> pre-existing PF_MEMALLOC flag and result in recursive reclaim. This was not a
>> problem until commit a8161d1ed609 ("mm, page_alloc: restructure direct
>> compaction handling in slowpath"), because direct compation was called only
>> after direct reclaim, which was skipped when PF_MEMALLOC flag was set.
>>
>> Even now it's only a theoretical issue, as the new callsite of
>> __alloc_pages_direct_compact() is reached only for costly orders and when
>> gfp_pfmemalloc_allowed() is true, which means either __GFP_NOMEMALLOC is in
> is false
>
>> gfp_flags or in_interrupt() is true. There is no such known context, but let's
>> play it safe and make __alloc_pages_direct_compact() robust for cases where
>> PF_MEMALLOC is already set.
>>
>> Fixes: a8161d1ed609 ("mm, page_alloc: restructure direct compaction handling in slowpath")
>> Reported-by: Andrey Ryabinin <[email protected]>
>> Signed-off-by: Vlastimil Babka <[email protected]>
>> Cc: <[email protected]>
>> ---
>> mm/page_alloc.c | 3 ++-
>> 1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 3589f8be53be..b84e6ffbe756 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -3288,6 +3288,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
>> enum compact_priority prio, enum compact_result *compact_result)
>> {
>> struct page *page;
>> + unsigned int noreclaim_flag = current->flags & PF_MEMALLOC;
>>
>> if (!order)
>> return NULL;
>> @@ -3295,7 +3296,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
>> current->flags |= PF_MEMALLOC;
>> *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
>> prio);
>> - current->flags &= ~PF_MEMALLOC;
>> + current->flags = (current->flags & ~PF_MEMALLOC) | noreclaim_flag;
>
> Perhaps this would look better:
>
> tsk_restore_flags(current, noreclaim_flag, PF_MEMALLOC);
>
> ?
Well, I didn't care much considering this is for stable only, and patch 2/4
rewrites this to the new api.
>> if (*compact_result <= COMPACT_INACTIVE)
>> return NULL;
>>
>