These fields are used for dumping info about allocation that triggered
OOM. For cgroup this information doesn't make much sense, because OOM
killer is always invoked from page fault handler. It isn't worth the
space these fields occupy in the task_struct.
Signed-off-by: Vladimir Davydov <[email protected]>
---
include/linux/sched.h | 2 --
mm/memcontrol.c | 14 +++++---------
2 files changed, 5 insertions(+), 11 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ba8d8355c93a..626f5da5c43e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1839,8 +1839,6 @@ struct task_struct {
#endif
#ifdef CONFIG_MEMCG
struct mem_cgroup *memcg_in_oom;
- gfp_t memcg_oom_gfp_mask;
- int memcg_oom_order;
/* number of pages to reclaim on returning to userland */
unsigned int memcg_nr_pages_over_high;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 36db05fa8acb..a217b1374c32 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1232,14 +1232,13 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
return limit;
}
-static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
- int order)
+static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg)
{
struct oom_control oc = {
.zonelist = NULL,
.nodemask = NULL,
- .gfp_mask = gfp_mask,
- .order = order,
+ .gfp_mask = GFP_KERNEL,
+ .order = 0,
};
struct mem_cgroup *iter;
unsigned long chosen_points = 0;
@@ -1605,8 +1604,6 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
*/
css_get(&memcg->css);
current->memcg_in_oom = memcg;
- current->memcg_oom_gfp_mask = mask;
- current->memcg_oom_order = order;
}
/**
@@ -1656,8 +1653,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
if (locked && !memcg->oom_kill_disable) {
mem_cgroup_unmark_under_oom(memcg);
finish_wait(&memcg_oom_waitq, &owait.wait);
- mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
- current->memcg_oom_order);
+ mem_cgroup_out_of_memory(memcg);
} else {
schedule();
mem_cgroup_unmark_under_oom(memcg);
@@ -5063,7 +5059,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
}
mem_cgroup_events(memcg, MEMCG_OOM, 1);
- if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
+ if (!mem_cgroup_out_of_memory(memcg))
break;
}
--
2.1.4
On Fri 11-03-16 13:12:47, Vladimir Davydov wrote:
> These fields are used for dumping info about allocation that triggered
> OOM. For cgroup this information doesn't make much sense, because OOM
> killer is always invoked from page fault handler.
The oom killer is indeed invoked in a different context but why printing
the original mask and order doesn't make any sense? Doesn't it help to
see that the reclaim has failed because of GFP_NOFS?
> It isn't worth the
> space these fields occupy in the task_struct.
struct mem_cgroup * memcg_in_oom; /* 8456 8 */
gfp_t memcg_oom_gfp_mask; /* 8464 4 */
int memcg_oom_order; /* 8468 4 */
unsigned int memcg_nr_pages_over_high; /* 8472 4 */
/* XXX 4 bytes hole, try to pack */
struct uprobe_task * utask; /* 8480 8 */
int pagefault_disabled; /* 8488 4 */
/* XXX 20 bytes hole, try to pack */
/* --- cacheline 133 boundary (8512 bytes) --- */
struct thread_struct thread; /* 8512 4352 */
/* --- cacheline 201 boundary (12864 bytes) --- */
/* size: 12864, cachelines: 201, members: 185 */
vs.
struct mem_cgroup * memcg_in_oom; /* 8456 8 */
unsigned int memcg_nr_pages_over_high; /* 8464 4 */
/* XXX 4 bytes hole, try to pack */
struct uprobe_task * utask; /* 8472 8 */
int pagefault_disabled; /* 8480 4 */
/* XXX 4 bytes hole, try to pack */
struct task_struct * oom_reaper_list; /* 8488 8 */
/* XXX 16 bytes hole, try to pack */
/* --- cacheline 133 boundary (8512 bytes) --- */
struct thread_struct thread; /* 8512 4352 */
/* --- cacheline 201 boundary (12864 bytes) --- */
/* size: 12864, cachelines: 201, members: 184 */
So it doesn't even seem to save any space in the config I am using. Does
it shrink the size of the structure for you?
>
> Signed-off-by: Vladimir Davydov <[email protected]>
> ---
> include/linux/sched.h | 2 --
> mm/memcontrol.c | 14 +++++---------
> 2 files changed, 5 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index ba8d8355c93a..626f5da5c43e 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1839,8 +1839,6 @@ struct task_struct {
> #endif
> #ifdef CONFIG_MEMCG
> struct mem_cgroup *memcg_in_oom;
> - gfp_t memcg_oom_gfp_mask;
> - int memcg_oom_order;
>
> /* number of pages to reclaim on returning to userland */
> unsigned int memcg_nr_pages_over_high;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 36db05fa8acb..a217b1374c32 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1232,14 +1232,13 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
> return limit;
> }
>
> -static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
> - int order)
> +static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg)
> {
> struct oom_control oc = {
> .zonelist = NULL,
> .nodemask = NULL,
> - .gfp_mask = gfp_mask,
> - .order = order,
> + .gfp_mask = GFP_KERNEL,
> + .order = 0,
> };
> struct mem_cgroup *iter;
> unsigned long chosen_points = 0;
> @@ -1605,8 +1604,6 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
> */
> css_get(&memcg->css);
> current->memcg_in_oom = memcg;
> - current->memcg_oom_gfp_mask = mask;
> - current->memcg_oom_order = order;
> }
>
> /**
> @@ -1656,8 +1653,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
> if (locked && !memcg->oom_kill_disable) {
> mem_cgroup_unmark_under_oom(memcg);
> finish_wait(&memcg_oom_waitq, &owait.wait);
> - mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
> - current->memcg_oom_order);
> + mem_cgroup_out_of_memory(memcg);
> } else {
> schedule();
> mem_cgroup_unmark_under_oom(memcg);
> @@ -5063,7 +5059,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
> }
>
> mem_cgroup_events(memcg, MEMCG_OOM, 1);
> - if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
> + if (!mem_cgroup_out_of_memory(memcg))
> break;
> }
>
> --
> 2.1.4
--
Michal Hocko
SUSE Labs
On Fri, Mar 11, 2016 at 12:54:50PM +0100, Michal Hocko wrote:
> On Fri 11-03-16 13:12:47, Vladimir Davydov wrote:
> > These fields are used for dumping info about allocation that triggered
> > OOM. For cgroup this information doesn't make much sense, because OOM
> > killer is always invoked from page fault handler.
>
> The oom killer is indeed invoked in a different context but why printing
> the original mask and order doesn't make any sense? Doesn't it help to
> see that the reclaim has failed because of GFP_NOFS?
I don't see how this can be helpful. How would you use it?
Wouldn't it be better to print err msg in try_charge anyway?
...
> So it doesn't even seem to save any space in the config I am using. Does
> it shrink the size of the structure for you?
There are still several hundred bytes left before task_struct exceeds the
2-page threshold and hence increases the slab order, but that doesn't mean
we don't need to be conservative and do our best to spare some space for
future users that can't live w/o adding new fields.
Thanks,
Vladimir
On Fri 11-03-16 15:39:00, Vladimir Davydov wrote:
> On Fri, Mar 11, 2016 at 12:54:50PM +0100, Michal Hocko wrote:
> > On Fri 11-03-16 13:12:47, Vladimir Davydov wrote:
> > > These fields are used for dumping info about allocation that triggered
> > > OOM. For cgroup this information doesn't make much sense, because OOM
> > > killer is always invoked from page fault handler.
> >
> > The oom killer is indeed invoked in a different context but why printing
> > the original mask and order doesn't make any sense? Doesn't it help to
> > see that the reclaim has failed because of GFP_NOFS?
>
> I don't see how this can be helpful. How would you use it?
If we start seeing GFP_NOFS triggered OOMs we might be enforced to
rethink our current strategy to ignore this charge context for OOM.
> Wouldn't it be better to print err msg in try_charge anyway?
Wouldn't that lead to excessive amount of logged messages?
> ...
> > So it doesn't even seem to save any space in the config I am using. Does
> > it shrink the size of the structure for you?
>
> There are still several hundred bytes left before task_struct exceeds the
> 2-page threshold and hence increases the slab order, but that doesn't mean
> we don't need to be conservative and do our best to spare some space for
> future users that can't live w/o adding new fields.
I do agree that we should try hard to make task_struct as small as possible,
but here you are throwing away potentially useful information, replacing it
with something that might be misleading, and not shrinking the struct size.
This doesn't sound like a universal win to me. The situation would of course
be quite different if these were the last few bytes that push us to a higher
order.
--
Michal Hocko
SUSE Labs
On Fri, Mar 11, 2016 at 01:51:05PM +0100, Michal Hocko wrote:
> On Fri 11-03-16 15:39:00, Vladimir Davydov wrote:
> > On Fri, Mar 11, 2016 at 12:54:50PM +0100, Michal Hocko wrote:
> > > On Fri 11-03-16 13:12:47, Vladimir Davydov wrote:
> > > > These fields are used for dumping info about allocation that triggered
> > > > OOM. For cgroup this information doesn't make much sense, because OOM
> > > > killer is always invoked from page fault handler.
> > >
> > > The oom killer is indeed invoked in a different context but why printing
> > > the original mask and order doesn't make any sense? Doesn't it help to
> > > see that the reclaim has failed because of GFP_NOFS?
> >
> > I don't see how this can be helpful. How would you use it?
>
> If we start seeing GFP_NOFS triggered OOMs we might be enforced to
> rethink our current strategy to ignore this charge context for OOM.
IMO the fact that a lot of OOMs are triggered by GFP_NOFS allocations
can't be a good enough reason to reconsider OOM strategy. We need to
know what kind of allocation fails anyway, and the current OOM dump
gives us no clue about that.
Besides, what if OOM was triggered by GFP_NOFS by pure chance, i.e. it
would have been triggered by GFP_KERNEL if it had happened at that time?
IMO it's just confusing.
>
> > Wouldn't it be better to print err msg in try_charge anyway?
>
> Wouldn't that lead to excessive amount of logged messages?
We could ratelimit these messages. Slab charge failures are already
reported to dmesg (see ___slab_alloc -> slab_out_of_memory) and nobody's
complained so far. Are there any non-slab GFP_NOFS allocations charged
to memcg?
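For concreteness, such a ratelimited message could look roughly like the
sketch below (the helper name and format string are made up here, not part
of any patch in this thread):

	/* sketch only: hypothetical ratelimited charge-failure warning */
	static void mem_cgroup_warn_charge_failure(gfp_t gfp_mask,
						   unsigned int nr_pages)
	{
		static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
					      DEFAULT_RATELIMIT_BURST);

		if (!__ratelimit(&rs))
			return;

		pr_warn("%s (%d): memcg charge of %u pages failed, gfp_mask=0x%x\n",
			current->comm, task_pid_nr(current), nr_pages, gfp_mask);
	}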
Thanks,
Vladimir
On Fri 11-03-16 16:45:34, Vladimir Davydov wrote:
> On Fri, Mar 11, 2016 at 01:51:05PM +0100, Michal Hocko wrote:
> > On Fri 11-03-16 15:39:00, Vladimir Davydov wrote:
> > > On Fri, Mar 11, 2016 at 12:54:50PM +0100, Michal Hocko wrote:
> > > > On Fri 11-03-16 13:12:47, Vladimir Davydov wrote:
> > > > > These fields are used for dumping info about allocation that triggered
> > > > > OOM. For cgroup this information doesn't make much sense, because OOM
> > > > > killer is always invoked from page fault handler.
> > > >
> > > > The oom killer is indeed invoked in a different context but why printing
> > > > the original mask and order doesn't make any sense? Doesn't it help to
> > > > see that the reclaim has failed because of GFP_NOFS?
> > >
> > > I don't see how this can be helpful. How would you use it?
> >
> > If we start seeing GFP_NOFS triggered OOMs we might be enforced to
> > rethink our current strategy to ignore this charge context for OOM.
>
> IMO the fact that a lot of OOMs are triggered by GFP_NOFS allocations
> can't be a good enough reason to reconsider OOM strategy.
What I meant was that the global path doesn't trigger OOM for !__GFP_FS
while we do in the memcg charge path.
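Roughly, and only as an illustration (the function name below is made up,
this is not verbatim kernel code), the global allocator refuses to invoke
the OOM killer with a check along these lines, while the memcg charge path
has no such bail-out:

	/* illustration only: the global path skips OOM for IO-less reclaim */
	static bool global_oom_allowed(gfp_t gfp_mask)
	{
		if (!(gfp_mask & __GFP_FS))
			return false;
		return true;
	}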
> We need to
> know what kind of allocation fails anyway, and the current OOM dump
> gives us no clue about that.
We do print gfp_mask now so we know what was the charging context.
> Besides, what if OOM was triggered by GFP_NOFS by pure chance, i.e. it
> would have been triggered by GFP_KERNEL if it had happened at that time?
Not really. GFP_KERNEL would allow to invoke some shrinkers which are
GFP_NOFS incompatible.
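For instance, purely as an illustration and with made-up names, an fs
shrinker typically refuses to run in a GFP_NOFS context like this:

	/* illustration only, identifiers are hypothetical */
	static unsigned long example_scan_objects(struct shrinker *shrink,
						  struct shrink_control *sc)
	{
		if (!(sc->gfp_mask & __GFP_FS))
			return SHRINK_STOP;	/* fs objects can't be touched under GFP_NOFS */
		return example_reclaim_objects(sc->nr_to_scan);	/* hypothetical helper */
	}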
> IMO it's just confusing.
>
> >
> > > Wouldn't it be better to print err msg in try_charge anyway?
> >
> > Wouldn't that lead to excessive amount of logged messages?
>
> We could ratelimit these messages. Slab charge failures are already
> reported to dmesg (see ___slab_alloc -> slab_out_of_memory) and nobody's
> complained so far. Are there any non-slab GFP_NOFS allocations charged
> to memcg?
I believe there might be some coming from FS via add_to_page_cache_lru.
Especially when their mapping gfp_mask clears __GFP_FS. I haven't
checked the code deeper but some of those might be called from the page
fault path and trigger memcg OOM. I would have to look closer.
--
Michal Hocko
SUSE Labs
On Fri, Mar 11, 2016 at 03:30:31PM +0100, Michal Hocko wrote:
> On Fri 11-03-16 16:45:34, Vladimir Davydov wrote:
> > On Fri, Mar 11, 2016 at 01:51:05PM +0100, Michal Hocko wrote:
> > > On Fri 11-03-16 15:39:00, Vladimir Davydov wrote:
> > > > On Fri, Mar 11, 2016 at 12:54:50PM +0100, Michal Hocko wrote:
> > > > > On Fri 11-03-16 13:12:47, Vladimir Davydov wrote:
> > > > > > These fields are used for dumping info about allocation that triggered
> > > > > > OOM. For cgroup this information doesn't make much sense, because OOM
> > > > > > killer is always invoked from page fault handler.
> > > > >
> > > > > The oom killer is indeed invoked in a different context but why printing
> > > > > the original mask and order doesn't make any sense? Doesn't it help to
> > > > > see that the reclaim has failed because of GFP_NOFS?
> > > >
> > > > I don't see how this can be helpful. How would you use it?
> > >
> > > If we start seeing GFP_NOFS triggered OOMs we might be enforced to
> > > rethink our current strategy to ignore this charge context for OOM.
> >
> > IMO the fact that a lot of OOMs are triggered by GFP_NOFS allocations
> > can't be a good enough reason to reconsider OOM strategy.
>
> What I meant was that the global path doesn't trigger OOM for !__GFP_FS
> while we do in the memcg charge path.
OK, missed your point, sorry.
>
> > We need to
> > know what kind of allocation fails anyway, and the current OOM dump
> > gives us no clue about that.
>
> We do print gfp_mask now so we know what was the charging context.
>
> > Besides, what if OOM was triggered by GFP_NOFS by pure chance, i.e. it
> > would have been triggered by GFP_KERNEL if it had happened at that time?
>
> Not really. GFP_KERNEL would allow to invoke some shrinkers which are
> GFP_NOFS incompatible.
Can't a GFP_NOFS allocation happen when there are no shrinkable objects
to drop so that there's no real difference between GFP_KERNEL and
GFP_NOFS?
>
> > IMO it's just confusing.
> >
> > >
> > > > Wouldn't it be better to print err msg in try_charge anyway?
> > >
> > > Wouldn't that lead to excessive amount of logged messages?
> >
> > We could ratelimit these messages. Slab charge failures are already
> > reported to dmesg (see ___slab_alloc -> slab_out_of_memory) and nobody's
> > complained so far. Are there any non-slab GFP_NOFS allocations charged
> > to memcg?
>
> I believe there might be some coming from FS via add_to_page_cache_lru.
> Especially when their mapping gfp_mask clears __GFP_FS. I haven't
> checked the code deeper but some of those might be called from the page
> fault path and trigger memcg OOM. I would have to look closer.
If you think this warning is really a must have, and you don't like to
warn about every charge failure, maybe we could just print info about
allocation that triggered OOM right in mem_cgroup_oom, like the code
below does? I think it would be more-or-less equivalent to what we have
now except it wouldn't require storing gfp_mask on task_struct.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a217b1374c32..d8e130d14f5d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1604,6 +1604,8 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
*/
css_get(&memcg->css);
current->memcg_in_oom = memcg;
+
+ pr_warn("Process ... triggered OOM in memcg ... gfp ...\n");
}
/**
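Purely for illustration, and not something proposed in this thread, the
placeholder pr_warn() above could be fleshed out using the mask and order
arguments that mem_cgroup_oom() already receives, e.g.:

	pr_warn("%s (%d) triggered OOM in memcg, gfp_mask=0x%x, order=%d\n",
		current->comm, task_pid_nr(current), mask, order);

The exact wording and fields would of course be up to the final patch.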
On Fri 11-03-16 18:02:24, Vladimir Davydov wrote:
> On Fri, Mar 11, 2016 at 03:30:31PM +0100, Michal Hocko wrote:
[...]
> > Not really. GFP_KERNEL would allow to invoke some shrinkers which are
> > GFP_NOFS incompatible.
>
> Can't a GFP_NOFS allocation happen when there are no shrinkable objects
> to drop so that there's no real difference between GFP_KERNEL and
> GFP_NOFS?
Yes it can and we do not handle that case even in the global case.
[...]
> > > We could ratelimit these messages. Slab charge failures are already
> > > reported to dmesg (see ___slab_alloc -> slab_out_of_memory) and nobody's
> > > complained so far. Are there any non-slab GFP_NOFS allocations charged
> > > to memcg?
> >
> > I believe there might be some coming from FS via add_to_page_cache_lru.
> > Especially when their mapping gfp_mask clears __GFP_FS. I haven't
> > checked the code deeper but some of those might be called from the page
> > fault path and trigger memcg OOM. I would have to look closer.
>
> If you think this warning is really a must have, and you don't like to
> warn about every charge failure, maybe we could just print info about
> allocation that triggered OOM right in mem_cgroup_oom, like the code
> below does? I think it would be more-or-less equivalent to what we have
> now except it wouldn't require storing gfp_mask on task_struct.
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index a217b1374c32..d8e130d14f5d 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1604,6 +1604,8 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
> */
> css_get(&memcg->css);
> current->memcg_in_oom = memcg;
> +
> + pr_warn("Process ... triggered OOM in memcg ... gfp ...\n");
Hmm, that could lead to intermixed oom reports and matching the failure
to the particular report would be slightly harder. But I guess it would
be acceptable if it can help to shrink the task_struct in the end. There
are people (google at least) who rely on the oom reports so I would
ask them if they are OK with that. I do not see any obvious issues
with this.
> }
>
> /**
--
Michal Hocko
SUSE Labs
On Fri, Mar 11, 2016 at 04:47:04PM +0100, Michal Hocko wrote:
> On Fri 11-03-16 18:02:24, Vladimir Davydov wrote:
> > On Fri, Mar 11, 2016 at 03:30:31PM +0100, Michal Hocko wrote:
> [...]
> > > Not really. GFP_KERNEL would allow to invoke some shrinkers which are
> > > GFP_NOFS incompatible.
> >
> > Can't a GFP_NOFS allocation happen when there are no shrinkable objects
> > to drop so that there's no real difference between GFP_KERNEL and
> > GFP_NOFS?
>
> Yes it can and we do not handle that case even in the global case.
>
> [...]
> > > > We could ratelimit these messages. Slab charge failures are already
> > > > reported to dmesg (see ___slab_alloc -> slab_out_of_memory) and nobody's
> > > > complained so far. Are there any non-slab GFP_NOFS allocations charged
> > > > to memcg?
> > >
> > > I believe there might be some coming from FS via add_to_page_cache_lru.
> > > Especially when their mapping gfp_mask clears __GFP_FS. I haven't
> > > checked the code deeper but some of those might be called from the page
> > > fault path and trigger memcg OOM. I would have to look closer.
> >
> > If you think this warning is really a must have, and you don't like to
> > warn about every charge failure, maybe we could just print info about
> > allocation that triggered OOM right in mem_cgroup_oom, like the code
> > below does? I think it would be more-or-less equivalent to what we have
> > now except it wouldn't require storing gfp_mask on task_struct.
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index a217b1374c32..d8e130d14f5d 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -1604,6 +1604,8 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
> > */
> > css_get(&memcg->css);
> > current->memcg_in_oom = memcg;
> > +
> > + pr_warn("Process ... triggered OOM in memcg ... gfp ...\n");
>
> Hmm, that could lead to intermixed oom reports and matching the failure
> to the particular report would be slightly harder. But I guess it would
> be acceptable if it can help to shrink the task_struct in the end. There
> are people (google at least) who rely on the oom reports so I would
> ask them if they are OK with that. I do not see any obvious issues
> with this.
OK, I'll send v2 then.
Thanks a lot for your feedback.