hi Rafael,
on s390 we have a reproduceable testcase where, after all devices were
suspended, a memory allocation results in disk IO. I know that this is
similar to the current discussion about magically changing the gpf mask,
but in our case the related allocation is triggered not by a device driver
but directly by hibernation_snapshot. The call chain looks like this:
STACK:
0 schedule+1796 [0x5a7af0]
1 io_schedule+98 [0x5a82ce]
2 sync_page_killable+4 [0x1ec424]
3 __wait_on_bit+204 [0x5a8bc4]
4 add_to_page_cache_locked+2 [0x1ec766]
5 shrink_page_list+2372 [0x1fc5b0]
6 shrink_list+2496 [0x1fd02c]
7 shrink_zone+932 [0x1fd3e0]
8 try_to_free_pages+668 [0x1fe4bc]
9 __alloc_pages_nodemask+1346 [0x1f5056]
10 __get_free_pages+76 [0x1f52dc]
11 __build_sched_domains+60 [0x144f98]
12 partition_sched_domains+696 [0x145dcc]
13 update_sched_domains+100 [0x146104]
14 notifier_call_chain+166 [0x5ae112]
15 raw_notifier_call_chain+44 [0x1800c4]
16 _cpu_down+586 [0x59f212]
17 disable_nonboot_cpus+354 [0x155ad2]
18 hibernation_snapshot+324 [0x1a7938]
19 hibernate+304 [0x1a7bcc]
20 state_store+130 [0x1a645e]
21 sysfs_write_file+264 [0x2b551c]
22 vfs_write+190 [0x23f98a]
23 sys_write+100 [0x23fb50]
24 sysc_noemu+16 [0x118ff6]
a possible fix would be to call disable_nonboot_cpus before suspending the
devices..
NOTE: this affects pm debug modes
---
Since disable_nonboot_cpus triggers memory allocations we should call
it before suspending the devices in hibernation_snapshot.
Signed-off-by: Sebastian Ott <[email protected]>
---
kernel/power/hibernate.c | 17 ++++++++---------
1 file changed, 8 insertions(+), 9 deletions(-)
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -229,13 +229,9 @@ static int create_image(int platform_mod
}
error = platform_pre_snapshot(platform_mode);
- if (error || hibernation_test(TEST_PLATFORM))
- goto Platform_finish;
-
- error = disable_nonboot_cpus();
- if (error || hibernation_test(TEST_CPUS)
+ if (error || hibernation_test(TEST_PLATFORM)
|| hibernation_testmode(HIBERNATION_TEST))
- goto Enable_cpus;
+ goto Platform_finish;
local_irq_disable();
@@ -269,9 +265,6 @@ static int create_image(int platform_mod
Enable_irqs:
local_irq_enable();
- Enable_cpus:
- enable_nonboot_cpus();
-
Platform_finish:
platform_finish(platform_mode);
@@ -303,6 +296,10 @@ int hibernation_snapshot(int platform_mo
if (error)
goto Close;
+ error = disable_nonboot_cpus();
+ if (error || hibernation_test(TEST_CPUS))
+ goto Enable_cpus;
+
suspend_console();
error = dpm_suspend_start(PMSG_FREEZE);
if (error)
@@ -322,6 +319,8 @@ int hibernation_snapshot(int platform_mo
dpm_resume_end(in_suspend ?
(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
resume_console();
+ Enable_cpus:
+ enable_nonboot_cpus();
Close:
platform_end(platform_mode);
return error;
On Friday 22 January 2010, Sebastian Ott wrote:
> hi Rafael,
>
> on s390 we have a reproduceable testcase where, after all devices were
> suspended, a memory allocation results in disk IO. I know that this is
> similar to the current discussion about magically changing the gpf mask,
> but in our case the related allocation is triggered not by a device driver
> but directly by hibernation_snapshot. The call chain looks like this:
>
> STACK:
> 0 schedule+1796 [0x5a7af0]
> 1 io_schedule+98 [0x5a82ce]
> 2 sync_page_killable+4 [0x1ec424]
> 3 __wait_on_bit+204 [0x5a8bc4]
> 4 add_to_page_cache_locked+2 [0x1ec766]
> 5 shrink_page_list+2372 [0x1fc5b0]
> 6 shrink_list+2496 [0x1fd02c]
> 7 shrink_zone+932 [0x1fd3e0]
> 8 try_to_free_pages+668 [0x1fe4bc]
> 9 __alloc_pages_nodemask+1346 [0x1f5056]
> 10 __get_free_pages+76 [0x1f52dc]
> 11 __build_sched_domains+60 [0x144f98]
> 12 partition_sched_domains+696 [0x145dcc]
> 13 update_sched_domains+100 [0x146104]
> 14 notifier_call_chain+166 [0x5ae112]
> 15 raw_notifier_call_chain+44 [0x1800c4]
> 16 _cpu_down+586 [0x59f212]
> 17 disable_nonboot_cpus+354 [0x155ad2]
> 18 hibernation_snapshot+324 [0x1a7938]
> 19 hibernate+304 [0x1a7bcc]
> 20 state_store+130 [0x1a645e]
> 21 sysfs_write_file+264 [0x2b551c]
> 22 vfs_write+190 [0x23f98a]
> 23 sys_write+100 [0x23fb50]
> 24 sysc_noemu+16 [0x118ff6]
>
> a possible fix would be to call disable_nonboot_cpus before suspending the
> devices..
This is going against the changes attempting to speed-up suspend and resume,
such as the asynchronous suspend/resume patchset, so I don't agree with it.
The real solution would be to remove the memory allocations from the
_cpu_down() call path.
BTW, this is one of the cases I and Ben are talking about where it's not
practical to rework the code just to avoid memory allocation problems during
suspend/resume.
Rafael
On Friday 22 January 2010, Rafael J. Wysocki wrote:
> On Friday 22 January 2010, Sebastian Ott wrote:
> > hi Rafael,
> >
> > on s390 we have a reproduceable testcase where, after all devices were
> > suspended, a memory allocation results in disk IO. I know that this is
> > similar to the current discussion about magically changing the gpf mask,
> > but in our case the related allocation is triggered not by a device driver
> > but directly by hibernation_snapshot. The call chain looks like this:
> >
> > STACK:
> > 0 schedule+1796 [0x5a7af0]
> > 1 io_schedule+98 [0x5a82ce]
> > 2 sync_page_killable+4 [0x1ec424]
> > 3 __wait_on_bit+204 [0x5a8bc4]
> > 4 add_to_page_cache_locked+2 [0x1ec766]
> > 5 shrink_page_list+2372 [0x1fc5b0]
> > 6 shrink_list+2496 [0x1fd02c]
> > 7 shrink_zone+932 [0x1fd3e0]
> > 8 try_to_free_pages+668 [0x1fe4bc]
> > 9 __alloc_pages_nodemask+1346 [0x1f5056]
> > 10 __get_free_pages+76 [0x1f52dc]
> > 11 __build_sched_domains+60 [0x144f98]
> > 12 partition_sched_domains+696 [0x145dcc]
> > 13 update_sched_domains+100 [0x146104]
> > 14 notifier_call_chain+166 [0x5ae112]
> > 15 raw_notifier_call_chain+44 [0x1800c4]
> > 16 _cpu_down+586 [0x59f212]
> > 17 disable_nonboot_cpus+354 [0x155ad2]
> > 18 hibernation_snapshot+324 [0x1a7938]
> > 19 hibernate+304 [0x1a7bcc]
> > 20 state_store+130 [0x1a645e]
> > 21 sysfs_write_file+264 [0x2b551c]
> > 22 vfs_write+190 [0x23f98a]
> > 23 sys_write+100 [0x23fb50]
> > 24 sysc_noemu+16 [0x118ff6]
> >
> > a possible fix would be to call disable_nonboot_cpus before suspending the
> > devices..
>
> This is going against the changes attempting to speed-up suspend and resume,
> such as the asynchronous suspend/resume patchset, so I don't agree with it.
In fact there's more to it.
enable_nonboot_cpus() has to be called before suspend_ops->wake() (and
analogously for hibernation), because of some ACPI-related ordering constraints
(calling them in the reverse orther leads to _serious_ problems during resume).
In turn, suspend_ops->wake() should be called before we resume devices, for
similar reasons.
So, your patch really would break things.
Rafael
Hi.
On Fri, 22 Jan 2010, Rafael J. Wysocki wrote:
> On Friday 22 January 2010, Sebastian Ott wrote:
> >
> > a possible fix would be to call disable_nonboot_cpus before suspending the
> > devices..
>
> This is going against the changes attempting to speed-up suspend and resume,
> such as the asynchronous suspend/resume patchset, so I don't agree with it.
Isn't the main benefit for this scenario that while a driver starts io and
waits for interrupts, the callback for the next device can be called? And
this can be done with one cpu as well.
>
> The real solution would be to remove the memory allocations from the
> _cpu_down() call path.
So you have to also ban allocations from all registered notifiers at the
cpu_chain. And since enable_nonboot_cpus is called before the devices are
woken up, the same would be true for _cpu_up() which may not be done
easily.
>
> BTW, this is one of the cases I and Ben are talking about where it's not
> practical to rework the code just to avoid memory allocation problems during
> suspend/resume.
Ok. All i'm saying is that in hibernation_snapshot/create_image memory
allocations are directely triggered after all devices were put to sleep /
before woken up - and this looks like a bug.
For the driver case - what about using your patch to not modify the gfp
mask but print a warning instead so that these drivers can be identified
and fixed.
Regards,
Sebastian
On Monday 25 January 2010, Sebastian Ott wrote:
> Hi.
>
> On Fri, 22 Jan 2010, Rafael J. Wysocki wrote:
>
> > On Friday 22 January 2010, Sebastian Ott wrote:
> > >
> > > a possible fix would be to call disable_nonboot_cpus before suspending the
> > > devices..
> >
> > This is going against the changes attempting to speed-up suspend and resume,
> > such as the asynchronous suspend/resume patchset, so I don't agree with it.
>
> Isn't the main benefit for this scenario that while a driver starts io and
> waits for interrupts, the callback for the next device can be called? And
> this can be done with one cpu as well.
That's the basic idea, but the additional CPUs help quite a bit.
> > The real solution would be to remove the memory allocations from the
> > _cpu_down() call path.
>
> So you have to also ban allocations from all registered notifiers at the
> cpu_chain. And since enable_nonboot_cpus is called before the devices are
> woken up, the same would be true for _cpu_up() which may not be done
> easily.
That's correct. BTW, that's what the CPU_TASKS_FROZEN bit is for among other
things (perhaps it may be used to fix this particular issue).
> > BTW, this is one of the cases I and Ben are talking about where it's not
> > practical to rework the code just to avoid memory allocation problems during
> > suspend/resume.
>
> Ok. All i'm saying is that in hibernation_snapshot/create_image memory
> allocations are directely triggered after all devices were put to sleep /
> before woken up - and this looks like a bug.
I agree, but that's because people don't remember that CPU hotplug is also
used for suspend/hibernation. I don't know at the moment how much effort
it would take to fix all of these problems appropriately, but I _guess_ that
would be quite some work. That, among other things, is why I sent the patch
to modify gfp_allowed_mask before suspending devices.
> For the driver case - what about using your patch to not modify the gfp
> mask but print a warning instead so that these drivers can be identified
> and fixed.
We'd get a lot of warnings and there are cases where we know they would
trigger (eg. ACPI internals). So, I'd rather like to reduce the users' pain
(by changing gfp_allowed_mask) than add to it (by adding a warning that's
guaranteed to show up).
Rafael
Hi Rafael,
since you didn't like the idea of calling the driver callbacks with just
one cpu enabled, we gave your patch: "MM / PM: Force GFP_NOIO during
suspend/hibernation and resume" a try and i can confirm that this
fixes the issue on s390. Will this go in 2.6.33/stable?
greetings,
Sebastian
On Monday 01 February 2010, Sebastian Ott wrote:
> Hi Rafael,
>
> since you didn't like the idea of calling the driver callbacks with just
> one cpu enabled, we gave your patch: "MM / PM: Force GFP_NOIO during
> suspend/hibernation and resume" a try and i can confirm that this
> fixes the issue on s390.
Great, thanks for testing!
> Will this go in 2.6.33/stable?
That depends on Andrew, actually.
Andrew, what do you think of the patch at:
http://patchwork.kernel.org/patch/74740/mbox/ ?
It helps people and I don't see any major drawbacks of it.
Rafael
On Mon, 2010-02-01 at 16:30 +0100, Rafael J. Wysocki wrote:
> On Monday 01 February 2010, Sebastian Ott wrote:
> > Hi Rafael,
> >
> > since you didn't like the idea of calling the driver callbacks with just
> > one cpu enabled, we gave your patch: "MM / PM: Force GFP_NOIO during
> > suspend/hibernation and resume" a try and i can confirm that this
> > fixes the issue on s390.
>
> Great, thanks for testing!
>
> > Will this go in 2.6.33/stable?
>
> That depends on Andrew, actually.
>
> Andrew, what do you think of the patch at:
> http://patchwork.kernel.org/patch/74740/mbox/ ?
>
> It helps people and I don't see any major drawbacks of it.
And it works too.
I will today continue the torture testing of hibernate mode, but so far
no hands.
Best regards,
Maxim Levitsky
On Mon, 1 Feb 2010 16:30:04 +0100 "Rafael J. Wysocki" <[email protected]> wrote:
> On Monday 01 February 2010, Sebastian Ott wrote:
> > Hi Rafael,
> >
> > since you didn't like the idea of calling the driver callbacks with just
> > one cpu enabled, we gave your patch: "MM / PM: Force GFP_NOIO during
> > suspend/hibernation and resume" a try and i can confirm that this
> > fixes the issue on s390.
>
> Great, thanks for testing!
>
> > Will this go in 2.6.33/stable?
>
> That depends on Andrew, actually.
>
> Andrew, what do you think of the patch at:
> http://patchwork.kernel.org/patch/74740/mbox/ ?
>
> It helps people and I don't see any major drawbacks of it.
>
Seems sane. A couple of minor things:
- the names mm_force_noio_allocations() and mm_allow_io_allocations()
are a bit sucky. Asymmetrical.
- the functions don't nest: if someone calls
mm_force_noio_allocations() twice in succession then the kernel is
all mucked up. Why not:
gfp_t mm_set_gfp_mask(gfp_t mask)
{
gfp_t ret = gfp_allowed_mask;
gfp_allowed_mask = mask;
return ret;
}
which is of course racy :) Could add a local spinlock if really worried.
All your current callers can easily save the old value in a local.
On Monday 01 February 2010, Andrew Morton wrote:
> On Mon, 1 Feb 2010 16:30:04 +0100 "Rafael J. Wysocki" <[email protected]> wrote:
>
> > On Monday 01 February 2010, Sebastian Ott wrote:
> > > Hi Rafael,
> > >
> > > since you didn't like the idea of calling the driver callbacks with just
> > > one cpu enabled, we gave your patch: "MM / PM: Force GFP_NOIO during
> > > suspend/hibernation and resume" a try and i can confirm that this
> > > fixes the issue on s390.
> >
> > Great, thanks for testing!
> >
> > > Will this go in 2.6.33/stable?
> >
> > That depends on Andrew, actually.
> >
> > Andrew, what do you think of the patch at:
> > http://patchwork.kernel.org/patch/74740/mbox/ ?
> >
> > It helps people and I don't see any major drawbacks of it.
> >
>
> Seems sane. A couple of minor things:
>
> - the names mm_force_noio_allocations() and mm_allow_io_allocations()
> are a bit sucky. Asymmetrical.
Yeah. The lack of imagination. Sigh.
> - the functions don't nest: if someone calls
> mm_force_noio_allocations() twice in succession then the kernel is
> all mucked up. Why not:
>
> gfp_t mm_set_gfp_mask(gfp_t mask)
> {
> gfp_t ret = gfp_allowed_mask;
>
> gfp_allowed_mask = mask;
> return ret;
> }
>
> which is of course racy :) Could add a local spinlock if really worried.
I'm not sure how that helps. I'd need to read gfp_allowed_mask to obtain the
new value anyway.
> All your current callers can easily save the old value in a local.
Indeed.
Well, does the appended one look better?
Rafael
---
From: Rafael J. Wysocki <[email protected]>
Subject: MM / PM: Force GFP_NOIO during suspend/hibernation and resume (rev. 2)
There are quite a few GFP_KERNEL memory allocations made during
suspend/hibernation and resume that may cause the system to hang,
because the I/O operations they depend on cannot be completed due to
the underlying devices being suspended.
Avoid this problem by clearing the __GFP_IO and __GFP_FS bits in
gfp_allowed_mask before suspend/hibernation and restoring the
original values of these bits in gfp_allowed_mask durig the
subsequent resume.
Signed-off-by: Rafael J. Wysocki <[email protected]>
Reported-by: Maxim Levitsky <[email protected]>
---
include/linux/gfp.h | 8 ++++++++
kernel/power/hibernate.c | 9 +++++++++
kernel/power/suspend.c | 3 +++
3 files changed, 20 insertions(+)
Index: linux-2.6/kernel/power/hibernate.c
===================================================================
--- linux-2.6.orig/kernel/power/hibernate.c
+++ linux-2.6/kernel/power/hibernate.c
@@ -323,6 +323,7 @@ static int create_image(int platform_mod
int hibernation_snapshot(int platform_mode)
{
int error;
+ gfp_t saved_mask;
error = platform_begin(platform_mode);
if (error)
@@ -334,6 +335,7 @@ int hibernation_snapshot(int platform_mo
goto Close;
suspend_console();
+ saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
error = dpm_suspend_start(PMSG_FREEZE);
if (error)
goto Recover_platform;
@@ -351,6 +353,7 @@ int hibernation_snapshot(int platform_mo
dpm_resume_end(in_suspend ?
(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
+ set_gfp_allowed_mask(saved_mask);
resume_console();
Close:
platform_end(platform_mode);
@@ -445,14 +448,17 @@ static int resume_target_kernel(bool pla
int hibernation_restore(int platform_mode)
{
int error;
+ gfp_t saved_mask;
pm_prepare_console();
suspend_console();
+ saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
error = dpm_suspend_start(PMSG_QUIESCE);
if (!error) {
error = resume_target_kernel(platform_mode);
dpm_resume_end(PMSG_RECOVER);
}
+ set_gfp_allowed_mask(saved_mask);
resume_console();
pm_restore_console();
return error;
@@ -466,6 +472,7 @@ int hibernation_restore(int platform_mod
int hibernation_platform_enter(void)
{
int error;
+ gfp_t saved_mask;
if (!hibernation_ops)
return -ENOSYS;
@@ -481,6 +488,7 @@ int hibernation_platform_enter(void)
entering_platform_hibernation = true;
suspend_console();
+ saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
error = dpm_suspend_start(PMSG_HIBERNATE);
if (error) {
if (hibernation_ops->recover)
@@ -518,6 +526,7 @@ int hibernation_platform_enter(void)
Resume_devices:
entering_platform_hibernation = false;
dpm_resume_end(PMSG_RESTORE);
+ set_gfp_allowed_mask(saved_mask);
resume_console();
Close:
Index: linux-2.6/kernel/power/suspend.c
===================================================================
--- linux-2.6.orig/kernel/power/suspend.c
+++ linux-2.6/kernel/power/suspend.c
@@ -198,6 +198,7 @@ static int suspend_enter(suspend_state_t
int suspend_devices_and_enter(suspend_state_t state)
{
int error;
+ gfp_t saved_mask;
if (!suspend_ops)
return -ENOSYS;
@@ -208,6 +209,7 @@ int suspend_devices_and_enter(suspend_st
goto Close;
}
suspend_console();
+ saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
suspend_test_start();
error = dpm_suspend_start(PMSG_SUSPEND);
if (error) {
@@ -224,6 +226,7 @@ int suspend_devices_and_enter(suspend_st
suspend_test_start();
dpm_resume_end(PMSG_RESUME);
suspend_test_finish("resume devices");
+ set_gfp_allowed_mask(saved_mask);
resume_console();
Close:
if (suspend_ops->end)
Index: linux-2.6/include/linux/gfp.h
===================================================================
--- linux-2.6.orig/include/linux/gfp.h
+++ linux-2.6/include/linux/gfp.h
@@ -83,6 +83,7 @@ struct vm_area_struct;
#define GFP_HIGHUSER_MOVABLE (__GFP_WAIT | __GFP_IO | __GFP_FS | \
__GFP_HARDWALL | __GFP_HIGHMEM | \
__GFP_MOVABLE)
+#define GFP_IOFS (__GFP_IO | __GFP_FS)
#ifdef CONFIG_NUMA
#define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
@@ -342,4 +343,11 @@ static inline void set_gfp_allowed_mask(
gfp_allowed_mask = mask;
}
+static inline gfp_t clear_gfp_allowed_mask(gfp_t mask)
+{
+ gfp_t ret = gfp_allowed_mask;
+ gfp_allowed_mask &= ~mask;
+ return ret;
+}
+
#endif /* __LINUX_GFP_H */
On Wed, 3 Feb 2010 02:44:23 +0100 "Rafael J. Wysocki" <[email protected]> wrote:
> +static inline gfp_t clear_gfp_allowed_mask(gfp_t mask)
> +{
> + gfp_t ret = gfp_allowed_mask;
> + gfp_allowed_mask &= ~mask;
> + return ret;
> +}
Fair enuf.
Of course, this is all horridly racy/buggy without locking. Would I be
correct in hoping that all the callers happen when the system is in
everyone-is-frozen mode?
Perhaps we should add some documentation (or even an assertion) to
prevent someone from using these interfaces from within normal code.
On Wednesday 03 February 2010, Andrew Morton wrote:
> On Wed, 3 Feb 2010 02:44:23 +0100 "Rafael J. Wysocki" <[email protected]> wrote:
>
> > +static inline gfp_t clear_gfp_allowed_mask(gfp_t mask)
> > +{
> > + gfp_t ret = gfp_allowed_mask;
> > + gfp_allowed_mask &= ~mask;
> > + return ret;
> > +}
>
> Fair enuf.
>
> Of course, this is all horridly racy/buggy without locking. Would I be
> correct in hoping that all the callers happen when the system is in
> everyone-is-frozen mode?
As far as I can tell, gfp_allowed_mask is only touched during init apart from
this.
> Perhaps we should add some documentation (or even an assertion) to
> prevent someone from using these interfaces from within normal code.
I thought about that, but didn't invent anything smart enough.
Well, maybe except for a comment like "this must be called with pm_mutex held",
because that's the only case when it would be really safe.
Rafael
On Wed, 3 Feb 2010 23:34:37 +0100
"Rafael J. Wysocki" <[email protected]> wrote:
> On Wednesday 03 February 2010, Andrew Morton wrote:
> > On Wed, 3 Feb 2010 02:44:23 +0100 "Rafael J. Wysocki" <[email protected]> wrote:
> >
> > > +static inline gfp_t clear_gfp_allowed_mask(gfp_t mask)
> > > +{
> > > + gfp_t ret = gfp_allowed_mask;
> > > + gfp_allowed_mask &= ~mask;
> > > + return ret;
> > > +}
> >
> > Fair enuf.
> >
> > Of course, this is all horridly racy/buggy without locking. Would I be
> > correct in hoping that all the callers happen when the system is in
> > everyone-is-frozen mode?
>
> As far as I can tell, gfp_allowed_mask is only touched during init apart from
> this.
Well yes - the new interfaces are the problem - they're racy!
> > Perhaps we should add some documentation (or even an assertion) to
> > prevent someone from using these interfaces from within normal code.
>
> I thought about that, but didn't invent anything smart enough.
>
> Well, maybe except for a comment like "this must be called with pm_mutex held",
> because that's the only case when it would be really safe.
Is that the locking rule? My above guess was incorrect?
Maybe slip a
BUG_ON(!mutex_is_locked(&pm_mutex));
in there?
On Thursday 04 February 2010, Andrew Morton wrote:
> On Wed, 3 Feb 2010 23:34:37 +0100
> "Rafael J. Wysocki" <[email protected]> wrote:
>
> > On Wednesday 03 February 2010, Andrew Morton wrote:
> > > On Wed, 3 Feb 2010 02:44:23 +0100 "Rafael J. Wysocki" <[email protected]> wrote:
> > >
> > > > +static inline gfp_t clear_gfp_allowed_mask(gfp_t mask)
> > > > +{
> > > > + gfp_t ret = gfp_allowed_mask;
> > > > + gfp_allowed_mask &= ~mask;
> > > > + return ret;
> > > > +}
> > >
> > > Fair enuf.
> > >
> > > Of course, this is all horridly racy/buggy without locking. Would I be
> > > correct in hoping that all the callers happen when the system is in
> > > everyone-is-frozen mode?
> >
> > As far as I can tell, gfp_allowed_mask is only touched during init apart from
> > this.
>
> Well yes - the new interfaces are the problem - they're racy!
>
> > > Perhaps we should add some documentation (or even an assertion) to
> > > prevent someone from using these interfaces from within normal code.
> >
> > I thought about that, but didn't invent anything smart enough.
> >
> > Well, maybe except for a comment like "this must be called with pm_mutex held",
> > because that's the only case when it would be really safe.
>
> Is that the locking rule? My above guess was incorrect?
>
> Maybe slip a
>
> BUG_ON(!mutex_is_locked(&pm_mutex));
>
> in there?
That actually is a good idea, although I prefer WARN_ON (BUG_ON would be
rather blunt, so to speak, as it could kill setups not using suspend/hibernate
at all).
Updated patch follows.
Rafael
---
From: Rafael J. Wysocki <[email protected]>
Subject: MM / PM: Force GFP_NOIO during suspend/hibernation and resume (rev. 3)
There are quite a few GFP_KERNEL memory allocations made during
suspend/hibernation and resume that may cause the system to hang,
because the I/O operations they depend on cannot be completed due to
the underlying devices being suspended.
Mitigate this problem by clearing the __GFP_IO and __GFP_FS bits in
gfp_allowed_mask before suspend/hibernation and restoring the
original values of these bits in gfp_allowed_mask durig the
subsequent resume.
Signed-off-by: Rafael J. Wysocki <[email protected]>
Reported-by: Maxim Levitsky <[email protected]>
---
include/linux/gfp.h | 7 +++----
init/main.c | 2 +-
kernel/power/hibernate.c | 9 +++++++++
kernel/power/suspend.c | 3 +++
mm/page_alloc.c | 24 ++++++++++++++++++++++++
5 files changed, 40 insertions(+), 5 deletions(-)
Index: linux-2.6/include/linux/gfp.h
===================================================================
--- linux-2.6.orig/include/linux/gfp.h
+++ linux-2.6/include/linux/gfp.h
@@ -83,6 +83,7 @@ struct vm_area_struct;
#define GFP_HIGHUSER_MOVABLE (__GFP_WAIT | __GFP_IO | __GFP_FS | \
__GFP_HARDWALL | __GFP_HIGHMEM | \
__GFP_MOVABLE)
+#define GFP_IOFS (__GFP_IO | __GFP_FS)
#ifdef CONFIG_NUMA
#define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
@@ -337,9 +338,7 @@ void drain_local_pages(void *dummy);
extern gfp_t gfp_allowed_mask;
-static inline void set_gfp_allowed_mask(gfp_t mask)
-{
- gfp_allowed_mask = mask;
-}
+extern void set_gfp_allowed_mask(gfp_t mask);
+extern gfp_t clear_gfp_allowed_mask(gfp_t mask);
#endif /* __LINUX_GFP_H */
Index: linux-2.6/init/main.c
===================================================================
--- linux-2.6.orig/init/main.c
+++ linux-2.6/init/main.c
@@ -601,7 +601,7 @@ asmlinkage void __init start_kernel(void
local_irq_enable();
/* Interrupts are enabled now so all GFP allocations are safe. */
- set_gfp_allowed_mask(__GFP_BITS_MASK);
+ gfp_allowed_mask = __GFP_BITS_MASK;
kmem_cache_init_late();
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -76,6 +76,30 @@ unsigned long totalreserve_pages __read_
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
+/*
+ * The following functions are used by the suspend/hibernate code to temporarily
+ * change gfp_allowed_mask in order to avoid using I/O during memory allocations
+ * while devices are suspended. To avoid races with the suspend/hibernate code,
+ * they should always be called with pm_mutex held (gfp_allowed_mask also should
+ * only be modified with pm_mutex held, unless the suspend/hibernate code is
+ * guaranteed not to run in parallel with that modification).
+ */
+
+void set_gfp_allowed_mask(gfp_t mask)
+{
+ WARN_ON(!mutex_is_locked(&pm_mutex));
+ gfp_allowed_mask = mask;
+}
+
+gfp_t clear_gfp_allowed_mask(gfp_t mask)
+{
+ gfp_t ret = gfp_allowed_mask;
+
+ WARN_ON(!mutex_is_locked(&pm_mutex));
+ gfp_allowed_mask &= ~mask;
+ return ret;
+}
+
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
int pageblock_order __read_mostly;
#endif
Index: linux-2.6/kernel/power/hibernate.c
===================================================================
--- linux-2.6.orig/kernel/power/hibernate.c
+++ linux-2.6/kernel/power/hibernate.c
@@ -323,6 +323,7 @@ static int create_image(int platform_mod
int hibernation_snapshot(int platform_mode)
{
int error;
+ gfp_t saved_mask;
error = platform_begin(platform_mode);
if (error)
@@ -334,6 +335,7 @@ int hibernation_snapshot(int platform_mo
goto Close;
suspend_console();
+ saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
error = dpm_suspend_start(PMSG_FREEZE);
if (error)
goto Recover_platform;
@@ -351,6 +353,7 @@ int hibernation_snapshot(int platform_mo
dpm_resume_end(in_suspend ?
(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
+ set_gfp_allowed_mask(saved_mask);
resume_console();
Close:
platform_end(platform_mode);
@@ -445,14 +448,17 @@ static int resume_target_kernel(bool pla
int hibernation_restore(int platform_mode)
{
int error;
+ gfp_t saved_mask;
pm_prepare_console();
suspend_console();
+ saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
error = dpm_suspend_start(PMSG_QUIESCE);
if (!error) {
error = resume_target_kernel(platform_mode);
dpm_resume_end(PMSG_RECOVER);
}
+ set_gfp_allowed_mask(saved_mask);
resume_console();
pm_restore_console();
return error;
@@ -466,6 +472,7 @@ int hibernation_restore(int platform_mod
int hibernation_platform_enter(void)
{
int error;
+ gfp_t saved_mask;
if (!hibernation_ops)
return -ENOSYS;
@@ -481,6 +488,7 @@ int hibernation_platform_enter(void)
entering_platform_hibernation = true;
suspend_console();
+ saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
error = dpm_suspend_start(PMSG_HIBERNATE);
if (error) {
if (hibernation_ops->recover)
@@ -518,6 +526,7 @@ int hibernation_platform_enter(void)
Resume_devices:
entering_platform_hibernation = false;
dpm_resume_end(PMSG_RESTORE);
+ set_gfp_allowed_mask(saved_mask);
resume_console();
Close:
Index: linux-2.6/kernel/power/suspend.c
===================================================================
--- linux-2.6.orig/kernel/power/suspend.c
+++ linux-2.6/kernel/power/suspend.c
@@ -198,6 +198,7 @@ static int suspend_enter(suspend_state_t
int suspend_devices_and_enter(suspend_state_t state)
{
int error;
+ gfp_t saved_mask;
if (!suspend_ops)
return -ENOSYS;
@@ -208,6 +209,7 @@ int suspend_devices_and_enter(suspend_st
goto Close;
}
suspend_console();
+ saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
suspend_test_start();
error = dpm_suspend_start(PMSG_SUSPEND);
if (error) {
@@ -224,6 +226,7 @@ int suspend_devices_and_enter(suspend_st
suspend_test_start();
dpm_resume_end(PMSG_RESUME);
suspend_test_finish("resume devices");
+ set_gfp_allowed_mask(saved_mask);
resume_console();
Close:
if (suspend_ops->end)
On Thu, 4 Feb 2010 01:50:26 +0100 "Rafael J. Wysocki" <[email protected]> wrote:
> --- linux-2.6.orig/mm/page_alloc.c
> +++ linux-2.6/mm/page_alloc.c
> @@ -76,6 +76,30 @@ unsigned long totalreserve_pages __read_
> int percpu_pagelist_fraction;
> gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
>
> +/*
> + * The following functions are used by the suspend/hibernate code to temporarily
> + * change gfp_allowed_mask in order to avoid using I/O during memory allocations
> + * while devices are suspended. To avoid races with the suspend/hibernate code,
> + * they should always be called with pm_mutex held (gfp_allowed_mask also should
> + * only be modified with pm_mutex held, unless the suspend/hibernate code is
> + * guaranteed not to run in parallel with that modification).
> + */
> +
> +void set_gfp_allowed_mask(gfp_t mask)
> +{
> + WARN_ON(!mutex_is_locked(&pm_mutex));
> + gfp_allowed_mask = mask;
> +}
> +
> +gfp_t clear_gfp_allowed_mask(gfp_t mask)
> +{
> + gfp_t ret = gfp_allowed_mask;
> +
> + WARN_ON(!mutex_is_locked(&pm_mutex));
> + gfp_allowed_mask &= ~mask;
> + return ret;
> +}
Maybe put an ifdef CONFIG_foo around these so they don't get included
when they're unneeded?
On Wed, 3 Feb 2010 17:21:47 -0800 Andrew Morton <[email protected]> wrote:
> On Thu, 4 Feb 2010 01:50:26 +0100 "Rafael J. Wysocki" <[email protected]> wrote:
>
> > --- linux-2.6.orig/mm/page_alloc.c
> > +++ linux-2.6/mm/page_alloc.c
> > @@ -76,6 +76,30 @@ unsigned long totalreserve_pages __read_
> > int percpu_pagelist_fraction;
> > gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
> >
> > +/*
> > + * The following functions are used by the suspend/hibernate code to temporarily
> > + * change gfp_allowed_mask in order to avoid using I/O during memory allocations
> > + * while devices are suspended. To avoid races with the suspend/hibernate code,
> > + * they should always be called with pm_mutex held (gfp_allowed_mask also should
> > + * only be modified with pm_mutex held, unless the suspend/hibernate code is
> > + * guaranteed not to run in parallel with that modification).
> > + */
> > +
> > +void set_gfp_allowed_mask(gfp_t mask)
> > +{
> > + WARN_ON(!mutex_is_locked(&pm_mutex));
> > + gfp_allowed_mask = mask;
> > +}
> > +
> > +gfp_t clear_gfp_allowed_mask(gfp_t mask)
> > +{
> > + gfp_t ret = gfp_allowed_mask;
> > +
> > + WARN_ON(!mutex_is_locked(&pm_mutex));
> > + gfp_allowed_mask &= ~mask;
> > + return ret;
> > +}
>
> Maybe put an ifdef CONFIG_foo around these so they don't get included
> when they're unneeded?
>
I guess this:
mm/built-in.o: In function `clear_gfp_allowed_mask':
: undefined reference to `pm_mutex'
mm/built-in.o: In function `set_gfp_allowed_mask':
: undefined reference to `pm_mutex'
kinda answers my question.
On Thursday 04 February 2010, Andrew Morton wrote:
> On Wed, 3 Feb 2010 17:21:47 -0800 Andrew Morton <[email protected]> wrote:
>
> > On Thu, 4 Feb 2010 01:50:26 +0100 "Rafael J. Wysocki" <[email protected]> wrote:
> >
> > > --- linux-2.6.orig/mm/page_alloc.c
> > > +++ linux-2.6/mm/page_alloc.c
> > > @@ -76,6 +76,30 @@ unsigned long totalreserve_pages __read_
> > > int percpu_pagelist_fraction;
> > > gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
> > >
> > > +/*
> > > + * The following functions are used by the suspend/hibernate code to temporarily
> > > + * change gfp_allowed_mask in order to avoid using I/O during memory allocations
> > > + * while devices are suspended. To avoid races with the suspend/hibernate code,
> > > + * they should always be called with pm_mutex held (gfp_allowed_mask also should
> > > + * only be modified with pm_mutex held, unless the suspend/hibernate code is
> > > + * guaranteed not to run in parallel with that modification).
> > > + */
> > > +
> > > +void set_gfp_allowed_mask(gfp_t mask)
> > > +{
> > > + WARN_ON(!mutex_is_locked(&pm_mutex));
> > > + gfp_allowed_mask = mask;
> > > +}
> > > +
> > > +gfp_t clear_gfp_allowed_mask(gfp_t mask)
> > > +{
> > > + gfp_t ret = gfp_allowed_mask;
> > > +
> > > + WARN_ON(!mutex_is_locked(&pm_mutex));
> > > + gfp_allowed_mask &= ~mask;
> > > + return ret;
> > > +}
> >
> > Maybe put an ifdef CONFIG_foo around these so they don't get included
> > when they're unneeded?
> >
>
> I guess this:
>
> mm/built-in.o: In function `clear_gfp_allowed_mask':
> : undefined reference to `pm_mutex'
> mm/built-in.o: In function `set_gfp_allowed_mask':
> : undefined reference to `pm_mutex'
>
> kinda answers my question.
Sorry about that. I tend to forget about test-compiling with CONFIG_PM unset.
The one below shouldn't have this issue.
Rafael
---
From: Rafael J. Wysocki <[email protected]>
Subject: MM / PM: Force GFP_NOIO during suspend/hibernation and resume (rev. 3)
There are quite a few GFP_KERNEL memory allocations made during
suspend/hibernation and resume that may cause the system to hang,
because the I/O operations they depend on cannot be completed due to
the underlying devices being suspended.
Mitigate this problem by clearing the __GFP_IO and __GFP_FS bits in
gfp_allowed_mask before suspend/hibernation and restoring the
original values of these bits in gfp_allowed_mask durig the
subsequent resume.
Signed-off-by: Rafael J. Wysocki <[email protected]>
Reported-by: Maxim Levitsky <[email protected]>
---
include/linux/gfp.h | 7 +++----
init/main.c | 2 +-
kernel/power/hibernate.c | 9 +++++++++
kernel/power/suspend.c | 3 +++
mm/page_alloc.c | 26 ++++++++++++++++++++++++++
5 files changed, 42 insertions(+), 5 deletions(-)
Index: linux-2.6/include/linux/gfp.h
===================================================================
--- linux-2.6.orig/include/linux/gfp.h
+++ linux-2.6/include/linux/gfp.h
@@ -83,6 +83,7 @@ struct vm_area_struct;
#define GFP_HIGHUSER_MOVABLE (__GFP_WAIT | __GFP_IO | __GFP_FS | \
__GFP_HARDWALL | __GFP_HIGHMEM | \
__GFP_MOVABLE)
+#define GFP_IOFS (__GFP_IO | __GFP_FS)
#ifdef CONFIG_NUMA
#define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
@@ -337,9 +338,7 @@ void drain_local_pages(void *dummy);
extern gfp_t gfp_allowed_mask;
-static inline void set_gfp_allowed_mask(gfp_t mask)
-{
- gfp_allowed_mask = mask;
-}
+extern void set_gfp_allowed_mask(gfp_t mask);
+extern gfp_t clear_gfp_allowed_mask(gfp_t mask);
#endif /* __LINUX_GFP_H */
Index: linux-2.6/init/main.c
===================================================================
--- linux-2.6.orig/init/main.c
+++ linux-2.6/init/main.c
@@ -601,7 +601,7 @@ asmlinkage void __init start_kernel(void
local_irq_enable();
/* Interrupts are enabled now so all GFP allocations are safe. */
- set_gfp_allowed_mask(__GFP_BITS_MASK);
+ gfp_allowed_mask = __GFP_BITS_MASK;
kmem_cache_init_late();
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -76,6 +76,32 @@ unsigned long totalreserve_pages __read_
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
+#ifdef CONFIG_PM_SLEEP
+/*
+ * The following functions are used by the suspend/hibernate code to temporarily
+ * change gfp_allowed_mask in order to avoid using I/O during memory allocations
+ * while devices are suspended. To avoid races with the suspend/hibernate code,
+ * they should always be called with pm_mutex held (gfp_allowed_mask also should
+ * only be modified with pm_mutex held, unless the suspend/hibernate code is
+ * guaranteed not to run in parallel with that modification).
+ */
+
+void set_gfp_allowed_mask(gfp_t mask)
+{
+ WARN_ON(!mutex_is_locked(&pm_mutex));
+ gfp_allowed_mask = mask;
+}
+
+gfp_t clear_gfp_allowed_mask(gfp_t mask)
+{
+ gfp_t ret = gfp_allowed_mask;
+
+ WARN_ON(!mutex_is_locked(&pm_mutex));
+ gfp_allowed_mask &= ~mask;
+ return ret;
+}
+#endif /* CONFIG_PM_SLEEP */
+
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
int pageblock_order __read_mostly;
#endif
Index: linux-2.6/kernel/power/hibernate.c
===================================================================
--- linux-2.6.orig/kernel/power/hibernate.c
+++ linux-2.6/kernel/power/hibernate.c
@@ -323,6 +323,7 @@ static int create_image(int platform_mod
int hibernation_snapshot(int platform_mode)
{
int error;
+ gfp_t saved_mask;
error = platform_begin(platform_mode);
if (error)
@@ -334,6 +335,7 @@ int hibernation_snapshot(int platform_mo
goto Close;
suspend_console();
+ saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
error = dpm_suspend_start(PMSG_FREEZE);
if (error)
goto Recover_platform;
@@ -351,6 +353,7 @@ int hibernation_snapshot(int platform_mo
dpm_resume_end(in_suspend ?
(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
+ set_gfp_allowed_mask(saved_mask);
resume_console();
Close:
platform_end(platform_mode);
@@ -445,14 +448,17 @@ static int resume_target_kernel(bool pla
int hibernation_restore(int platform_mode)
{
int error;
+ gfp_t saved_mask;
pm_prepare_console();
suspend_console();
+ saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
error = dpm_suspend_start(PMSG_QUIESCE);
if (!error) {
error = resume_target_kernel(platform_mode);
dpm_resume_end(PMSG_RECOVER);
}
+ set_gfp_allowed_mask(saved_mask);
resume_console();
pm_restore_console();
return error;
@@ -466,6 +472,7 @@ int hibernation_restore(int platform_mod
int hibernation_platform_enter(void)
{
int error;
+ gfp_t saved_mask;
if (!hibernation_ops)
return -ENOSYS;
@@ -481,6 +488,7 @@ int hibernation_platform_enter(void)
entering_platform_hibernation = true;
suspend_console();
+ saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
error = dpm_suspend_start(PMSG_HIBERNATE);
if (error) {
if (hibernation_ops->recover)
@@ -518,6 +526,7 @@ int hibernation_platform_enter(void)
Resume_devices:
entering_platform_hibernation = false;
dpm_resume_end(PMSG_RESTORE);
+ set_gfp_allowed_mask(saved_mask);
resume_console();
Close:
Index: linux-2.6/kernel/power/suspend.c
===================================================================
--- linux-2.6.orig/kernel/power/suspend.c
+++ linux-2.6/kernel/power/suspend.c
@@ -198,6 +198,7 @@ static int suspend_enter(suspend_state_t
int suspend_devices_and_enter(suspend_state_t state)
{
int error;
+ gfp_t saved_mask;
if (!suspend_ops)
return -ENOSYS;
@@ -208,6 +209,7 @@ int suspend_devices_and_enter(suspend_st
goto Close;
}
suspend_console();
+ saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
suspend_test_start();
error = dpm_suspend_start(PMSG_SUSPEND);
if (error) {
@@ -224,6 +226,7 @@ int suspend_devices_and_enter(suspend_st
suspend_test_start();
dpm_resume_end(PMSG_RESUME);
suspend_test_finish("resume devices");
+ set_gfp_allowed_mask(saved_mask);
resume_console();
Close:
if (suspend_ops->end)