2006-10-27 08:00:24

by Richard Purdie

[permalink] [raw]
Subject: [PATCH, RFC/T] Fix handling of write failures to swap devices

Fix handling of write failures to swap devices.

Calling SetPageError(page) marks the data in memory as bad and processes using
the page in question will die unexpectedly. This isn't necessary as the data
in the memory page is still valid, just the copy on disk isn't. This patch
therefore removes this call.

Setting set_page_dirty(page) is good as the memory page will be retained and
processes don't die. It will try to write out the page again soon but a second
attempt at a write is probably no more likely to succeed than the first
resulting in IO loops. We can do better.

This patch attempts to unuse the page in a similar manner to swapoff. If
successful, mark the swap page as bad and remove it from use. If we fail to
remove all references, we fall back on set_page_dirty above which will retry
the write.

If we can mark the swap page as bad, adjust the VM accounting to reflect this.

Signed-off-by: Richard Purdie <[email protected]>

---

Comments and testing from people who know this area of code better than
me would be appreciated!

include/linux/swap.h | 1
mm/page_io.c | 13 ----
mm/swap_state.c | 1
mm/swapfile.c | 156 +++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 160 insertions(+), 11 deletions(-)

Index: linux-2.6.19-rc3/include/linux/swap.h
===================================================================
--- linux-2.6.19-rc3.orig/include/linux/swap.h 2006-10-26 13:49:41.000000000 +0100
+++ linux-2.6.19-rc3/include/linux/swap.h 2006-10-26 13:50:40.000000000 +0100
@@ -253,6 +253,7 @@ extern sector_t map_swap_page(struct swa
extern struct swap_info_struct *get_swap_info_struct(unsigned);
extern int can_share_swap_page(struct page *);
extern int remove_exclusive_swap_page(struct page *);
+extern void try_to_mark_swapbad(struct page *);
struct backing_dev_info;

extern spinlock_t swap_lock;
Index: linux-2.6.19-rc3/mm/swapfile.c
===================================================================
--- linux-2.6.19-rc3.orig/mm/swapfile.c 2006-10-26 13:49:41.000000000 +0100
+++ linux-2.6.19-rc3/mm/swapfile.c 2006-10-26 13:50:40.000000000 +0100
@@ -304,6 +304,25 @@ void swap_free(swp_entry_t entry)
}
}

+static void swap_markbad_free(swp_entry_t entry)
+{
+ struct swap_info_struct * p;
+ unsigned long offset = swp_offset(entry);
+
+ p = swap_info_get(entry);
+ if (p) {
+ printk(KERN_ERR "Marking swap offset %ld as bad\n", offset);
+
+ if (swap_entry_free(p, offset) != 0)
+ printk(KERN_ERR "Bad swap entry didn't have zero count?\n");
+ p->swap_map[offset] = SWAP_MAP_BAD;
+ p->pages--;
+ nr_swap_pages--;
+ total_swap_pages--;
+ spin_unlock(&swap_lock);
+ }
+}
+
/*
* How many references to page are currently swapped out?
*/
@@ -887,6 +906,143 @@ static int try_to_unuse(unsigned int typ
}

/*
+ * Try to unuse a page (after a write failure).
+ * The caller holds a reference on the entry so it can
+ * perform the final free and mark the swap page bad.
+ * The page writeback lock is already held.
+ */
+static int try_to_unuse_page(struct page *page)
+{
+ swp_entry_t entry;
+ struct swap_info_struct *si;
+ unsigned short *swap_map;
+ unsigned int i = 0;
+ int retval = 0;
+
+ entry.val = page_private(page);
+
+ si = swap_info_get(entry);
+ if (!si)
+ return -ENODEV;
+
+ swap_map = &si->swap_map[swp_offset(entry)];
+ spin_unlock(&swap_lock);
+
+ if (*swap_map == SWAP_MAP_BAD)
+ return -EINVAL;
+
+ if (*swap_map == 1)
+ return 0;
+
+ lock_page(page);
+
+ /*
+ * Keep on scanning until all references have gone. Usually,
+ * one pass through is enough, but not necessarily.
+ * Try a few times, then give up.
+ */
+ for (i = 0; i < 5; i++) {
+
+ /* Note shmem_unuse will delete a swappage from
+ * the swap cache, unless the move to filepage failed.
+ * If so, it left swappage in cache yet lowered its
+ * swap count. We reincrement the count to try again later.
+ */
+ if ((*swap_map > 2) && shmem_unuse(entry, page) && PageSwapCache(page))
+ swap_duplicate(entry);
+
+ if (*swap_map == 1) {
+ /* shmem deleted the page for us */
+ unlock_page(page);
+ return 0;
+ }
+
+ /*
+ * Remove all references to entry.
+ */
+ if (*swap_map > 2) {
+ struct mm_struct *start_mm = &init_mm;
+ struct mm_struct *prev_mm = &init_mm;
+ struct mm_struct *mm;
+ struct list_head *p = &start_mm->mmlist;
+
+ atomic_inc(&start_mm->mm_users);
+ atomic_inc(&prev_mm->mm_users);
+ spin_lock(&mmlist_lock);
+ while (*swap_map > 2 && !retval &&
+ (p = p->next) != &start_mm->mmlist) {
+ mm = list_entry(p, struct mm_struct, mmlist);
+ if (atomic_inc_return(&mm->mm_users) == 1) {
+ atomic_dec(&mm->mm_users);
+ continue;
+ }
+ spin_unlock(&mmlist_lock);
+ mmput(prev_mm);
+ prev_mm = mm;
+
+ if (*swap_map > 2)
+ retval = unuse_mm(mm, entry, page);
+ spin_lock(&mmlist_lock);
+ }
+ spin_unlock(&mmlist_lock);
+ mmput(prev_mm);
+ mmput(start_mm);
+ if (retval) {
+ unlock_page(page);
+ return retval;
+ }
+ }
+
+ /*
+ * How could swap count reach 0x7fff when the maximum
+ * pid is 0x7fff, and there's no way to repeat a swap
+ * page within an mm (except in shmem, where it's the
+ * shared object which takes the reference count)?
+ * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
+ *
+ * We know "Undead"s can happen, they're okay, so don't
+ * report them; but do report if we reset SWAP_MAP_MAX.
+ */
+ if (*swap_map == SWAP_MAP_MAX) {
+ spin_lock(&swap_lock);
+ *swap_map = 2;
+ spin_unlock(&swap_lock);
+ printk(KERN_WARNING "unuse_page: cleared swap entry overflow\n");
+ }
+
+ if (PageSwapCache(page) && (*swap_map == 2)) {
+ /* We removed all the references, just the swapcache
+ * reference itself left */
+ write_lock_irq(&swapper_space.tree_lock);
+ __delete_from_swap_cache(page);
+ write_unlock_irq(&swapper_space.tree_lock);
+ swap_free(entry);
+ unlock_page(page);
+ return 0;
+ }
+ }
+
+ unlock_page(page);
+ return -EAGAIN;
+}
+
+/*
+ * Try to mark a swap page as bad after a write failure
+ */
+void try_to_mark_swapbad(struct page *page)
+{
+ swp_entry_t entry;
+ entry.val = page_private(page);
+
+ /* Pin entry so we call the final free */
+ swap_duplicate(entry);
+ if (try_to_unuse_page(page) < 0)
+ swap_free(entry);
+ else
+ swap_markbad_free(entry);
+}
+
+/*
* After a successful try_to_unuse, if no swap is now in use, we know
* we can empty the mmlist. swap_lock must be held on entry and exit.
* Note that mmlist_lock nests inside swap_lock, and an mm must be
Index: linux-2.6.19-rc3/mm/page_io.c
===================================================================
--- linux-2.6.19-rc3.orig/mm/page_io.c 2006-10-26 13:49:41.000000000 +0100
+++ linux-2.6.19-rc3/mm/page_io.c 2006-10-26 13:50:40.000000000 +0100
@@ -53,21 +53,14 @@ static int end_swap_bio_write(struct bio
return 1;

if (!uptodate) {
- SetPageError(page);
- /*
- * We failed to write the page out to swap-space.
- * Re-dirty the page in order to avoid it being reclaimed.
- * Also print a dire warning that things will go BAD (tm)
- * very quickly.
- *
- * Also clear PG_reclaim to avoid rotate_reclaimable_page()
- */
+ /* Mark page as dirty so it stays in memory regardless.
+ Worst case, we retry the write but attempt to mark it bad */
set_page_dirty(page);
+ try_to_mark_swapbad(page);
printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
imajor(bio->bi_bdev->bd_inode),
iminor(bio->bi_bdev->bd_inode),
(unsigned long long)bio->bi_sector);
- ClearPageReclaim(page);
}
end_page_writeback(page);
bio_put(bio);
Index: linux-2.6.19-rc3/mm/swap_state.c
===================================================================
--- linux-2.6.19-rc3.orig/mm/swap_state.c 2006-10-26 13:49:41.000000000 +0100
+++ linux-2.6.19-rc3/mm/swap_state.c 2006-10-26 13:50:40.000000000 +0100
@@ -125,7 +125,6 @@ void __delete_from_swap_cache(struct pag
{
BUG_ON(!PageLocked(page));
BUG_ON(!PageSwapCache(page));
- BUG_ON(PageWriteback(page));
BUG_ON(PagePrivate(page));

radix_tree_delete(&swapper_space.page_tree, page_private(page));



2006-10-27 08:22:18

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH, RFC/T] Fix handling of write failures to swap devices

Richard Purdie wrote:
> Fix handling of write failures to swap devices.
>
> Calling SetPageError(page) marks the data in memory as bad and processes using
> the page in question will die unexpectedly. This isn't necessary as the data
> in the memory page is still valid, just the copy on disk isn't. This patch
> therefore removes this call.
>
> Setting set_page_dirty(page) is good as the memory page will be retained and
> processes don't die. It will try to write out the page again soon but a second
> attempt at a write is probably no more likely to succeed than the first
> resulting in IO loops. We can do better.
>
> This patch attempts to unuse the page in a similar manner to swapoff. If
> successful, mark the swap page as bad and remove it from use. If we fail to
> remove all references, we fall back on set_page_dirty above which will retry
> the write.
>
> If we can mark the swap page as bad, adjust the VM accounting to reflect this.
>
> Signed-off-by: Richard Purdie <[email protected]>
>
> ---
>
> Comments and testing from people who know this area of code better than
> me would be appreciated!

This is the right approach to handling swap write errors. However, you need
to cut down on the amount of code duplication. Also, if you hit that BUG_ON,
then you probably have a bug, don't remove it!

Otherwise, this would be nice to have so it's great you're working on it,
thanks.

--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com

2006-10-27 08:45:09

by Richard Purdie

[permalink] [raw]
Subject: Re: [PATCH, RFC/T] Fix handling of write failures to swap devices

On Fri, 2006-10-27 at 18:22 +1000, Nick Piggin wrote:
> Richard Purdie wrote:
> > Comments and testing from people who know this area of code better than
> > me would be appreciated!
>
> This is the right approach to handling swap write errors. However, you need
> to cut down on the amount of code duplication.

The code is subtly different to the swapoff code but I'll take another
look and see if I can refactor it now I have it all working.

> Also, if you hit that BUG_ON, then you probably have a bug, don't
> remove it!

I gave that a lot of thought. We are in a write handler and have to
handle the write error from there so the page will be marked as
writeback. That function appears to be safe to call with that set
through the new code path I added (which wouldn't have happened in the
past). I therefore decided it was safe and the simplest solution was to
remove the BUG_ON. If anyone can see a problem with a page being in
writeback in that function, please enlighten me though!

Cheers,

Richard

2006-10-27 09:35:53

by Richard Purdie

[permalink] [raw]
Subject: Re: [PATCH, RFC/T] Fix handling of write failures to swap devices

On Fri, 2006-10-27 at 18:22 +1000, Nick Piggin wrote:
> This is the right approach to handling swap write errors. However, you need
> to cut down on the amount of code duplication.

I've just looked at this and there isn't that much code duplicated, just
a couple of big comment blocks which make it look like that. Those
sections are similar in spirit but have been modified to work under
different circumstances. Having looked again, I don't think its worth
trying to merge them as it will become too unreadable.

Cheers,

Richard



2006-10-27 21:19:51

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH, RFC/T] Fix handling of write failures to swap devices

On Fri, 27 Oct 2006 08:59:55 +0100
Richard Purdie <[email protected]> wrote:

> Fix handling of write failures to swap devices.
>
> Calling SetPageError(page) marks the data in memory as bad and processes using
> the page in question will die unexpectedly. This isn't necessary as the data
> in the memory page is still valid, just the copy on disk isn't. This patch
> therefore removes this call.
>
> Setting set_page_dirty(page) is good as the memory page will be retained and
> processes don't die. It will try to write out the page again soon but a second
> attempt at a write is probably no more likely to succeed than the first
> resulting in IO loops. We can do better.
>
> This patch attempts to unuse the page in a similar manner to swapoff. If
> successful, mark the swap page as bad and remove it from use. If we fail to
> remove all references, we fall back on set_page_dirty above which will retry
> the write.
>
> If we can mark the swap page as bad, adjust the VM accounting to reflect this.
>

Sounds like a reasonable approach. Please copy Hugh (our lead swapoff maintainer)
on this work.

How was this tested?

2006-10-28 04:55:07

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH, RFC/T] Fix handling of write failures to swap devices

Richard Purdie wrote:
> On Fri, 2006-10-27 at 18:22 +1000, Nick Piggin wrote:
>
>>Richard Purdie wrote:
>>
>>>Comments and testing from people who know this area of code better than
>>>me would be appreciated!
>>
>>This is the right approach to handling swap write errors. However, you need
>>to cut down on the amount of code duplication.
>
>
> The code is subtly different to the swapoff code but I'll take another
> look and see if I can refactor it now I have it all working.

Subtly different code is the worst kind of code to be duplicating. It
really needs improving, I think.

>>Also, if you hit that BUG_ON, then you probably have a bug, don't
>>remove it!
>
>
> I gave that a lot of thought. We are in a write handler and have to
> handle the write error from there so the page will be marked as
> writeback. That function appears to be safe to call with that set
> through the new code path I added (which wouldn't have happened in the
> past). I therefore decided it was safe and the simplest solution was to
> remove the BUG_ON. If anyone can see a problem with a page being in
> writeback in that function, please enlighten me though!

It's just the wrong thing to do if the page has been set writeback with
a valid mapping. Presently we don't do any mapping specific accounting
in that path, but we could.

But now that I look at your patch, I don't think it is going to work.
end_swap_bio_write can be called from interrupt context, so you can't
lock the page, and you can't take any of those swap specific spinlocks
either.

You say that SetPageError makes the processes die unexpectedly? How and
where? We use SetPageError for IO errors, and it doesn't mean the page
has errors AFAIK.

The best policy would probably be to keep the end_page_writeback path as
it is, and then detect the PageError in the swap out path somewhere.

--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com

2006-10-28 10:43:53

by Richard Purdie

[permalink] [raw]
Subject: Re: [PATCH, RFC/T] Fix handling of write failures to swap devices

On Sat, 2006-10-28 at 14:55 +1000, Nick Piggin wrote:
> Richard Purdie wrote:
> > On Fri, 2006-10-27 at 18:22 +1000, Nick Piggin wrote:
> >>This is the right approach to handling swap write errors. However, you need
> >>to cut down on the amount of code duplication.
> >
> > The code is subtly different to the swapoff code but I'll take another
> > look and see if I can refactor it now I have it all working.
>
> Subtly different code is the worst kind of code to be duplicating. It
> really needs improving, I think.

I'm open to suggestions as to how to do it. The main problem was that in
in order to mark the page bad you needed to hold an extra reference on
the page so that the "mark bad" code would be the last code to touch the
page. The swapoff code doesn't need this and I don't like the idea of
passing some count value to a common function as that would be
confusing. I guess swapoff could start taking an extra reference but I
can see people objecting to that too as it doesn't need it.

> It's just the wrong thing to do if the page has been set writeback with
> a valid mapping. Presently we don't do any mapping specific accounting
> in that path, but we could.

Ok, I couldn't find a specific problem with calling that code with a
page set as writeback and I don't know enough about that code to realise
its the "wrong thing to do". I didn't really want to duplicate the set
of delete_from_swap_cache functions.

> But now that I look at your patch, I don't think it is going to work.
> end_swap_bio_write can be called from interrupt context, so you can't
> lock the page, and you can't take any of those swap specific spinlocks
> either.

This would seem to be a major problem. My tests were done with a driver
that wouldn't use interrupt context there. Is the writeback lock enough
or is the page lock really needed? I suspect some of the functions don't
like being called without the page lock.

> You say that SetPageError makes the processes die unexpectedly? How and
> where? We use SetPageError for IO errors, and it doesn't mean the page
> has errors AFAIK.

Most of the testing I did was on ARM. If you mark a page with
SetPageError, the next time userspace tries to access it, you get a data
abort and the process dies with a SIGBUS or other faults (even if the
page is marked as up to date). The effect was very clear and I just
assumed it was behaving as intended. It was this problem I started to
address as I didn't like processes randomly dieing...

Ignoring the filesystem calls to PageError(), the remaining two are
wait_on_page_writeback_range() and write_one_page() which something
like:

if (PageError(page))
return -EIO;

I'd guess something somewhere acts on the EIO and ignores the fact the
page is uptodate. I'm struggling a bit to work out the exact codepaths
though.

> The best policy would probably be to keep the end_page_writeback path as
> it is, and then detect the PageError in the swap out path somewhere.

I wanted to do it this way but couldn't as you end up with processes
dieing if you don't correct the PageError before the page is accessed.
Can someone comment on exactly what PageError should/shouldn't meant? Is
this an ARM specific issue or does it happen on other architectures too?

The possible solutions I can see are:

1. ARM has the PageError behaviour wrong somewhere and if so, there
might be different ways to handle this.
2. The PageError behaviour is correct so we need to find some other way
to mark a page as needing marking as bad and do so out of interrupt
context.
3. Any other ideas?

Can you give me a hint as to where in the swap out path you might want
to detect the error? The only way I can see is to have the end of
swap_writepage() wait_on_page_writeback() but that would appear to have
significant performance implications. I'm struggling to see a way you
can do this which is always outside interrupt context.

Cheers,

Richard


2006-10-28 12:10:48

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH, RFC/T] Fix handling of write failures to swap devices

Richard Purdie wrote:
> On Sat, 2006-10-28 at 14:55 +1000, Nick Piggin wrote:
>
>>Richard Purdie wrote:
>>
>>>On Fri, 2006-10-27 at 18:22 +1000, Nick Piggin wrote:
>>>
>>>>This is the right approach to handling swap write errors. However, you need
>>>>to cut down on the amount of code duplication.
>>>
>>>The code is subtly different to the swapoff code but I'll take another
>>>look and see if I can refactor it now I have it all working.
>>
>>Subtly different code is the worst kind of code to be duplicating. It
>>really needs improving, I think.
>
>
> I'm open to suggestions as to how to do it. The main problem was that in
> in order to mark the page bad you needed to hold an extra reference on
> the page so that the "mark bad" code would be the last code to touch the
> page. The swapoff code doesn't need this and I don't like the idea of
> passing some count value to a common function as that would be
> confusing. I guess swapoff could start taking an extra reference but I
> can see people objecting to that too as it doesn't need it.

If you have the page locked (which you can't from where you're trying,
but we'll tackle that next), and the page is swapcache, then doesn't
that pin you a reference to the swap entry as well? So I still don't
see why you need that extra reference.... I know there is a
try_to_unuse_page/entry hiding in try_to_unuse somewhere ;)

>>It's just the wrong thing to do if the page has been set writeback with
>>a valid mapping. Presently we don't do any mapping specific accounting
>>in that path, but we could.
>
>
> Ok, I couldn't find a specific problem with calling that code with a
> page set as writeback and I don't know enough about that code to realise
> its the "wrong thing to do". I didn't really want to duplicate the set
> of delete_from_swap_cache functions.

That's just how the pagecache works in general. You might be right
that there isn't anything wrong in this case, but hopefully we can
avoid it.

>>But now that I look at your patch, I don't think it is going to work.
>>end_swap_bio_write can be called from interrupt context, so you can't
>>lock the page, and you can't take any of those swap specific spinlocks
>>either.
>
>
> This would seem to be a major problem. My tests were done with a driver
> that wouldn't use interrupt context there. Is the writeback lock enough
> or is the page lock really needed? I suspect some of the functions don't
> like being called without the page lock.

The writeback bit isn't a lock, and yes the page lock is needed when
dealing with swapcache.

That isn't your only problem though, and we simply don't want to do
this (potentially expensive) unusing from interrupt context. Noting
the error and dealing with it in process context I think is the best
way to do it.

BTW. what's the driver you're using? It might be useful to have an
option to schedule a timer for the completions (at least the ones
which generate errors).

>>You say that SetPageError makes the processes die unexpectedly? How and
>>where? We use SetPageError for IO errors, and it doesn't mean the page
>>has errors AFAIK.
>
>
> Most of the testing I did was on ARM. If you mark a page with
> SetPageError, the next time userspace tries to access it, you get a data
> abort and the process dies with a SIGBUS or other faults (even if the
> page is marked as up to date). The effect was very clear and I just
> assumed it was behaving as intended. It was this problem I started to
> address as I didn't like processes randomly dieing...

I can't for the life of me see how or why this is happening. I don't
doubt it is a problem, but it smells like another bug.

> Ignoring the filesystem calls to PageError(), the remaining two are
> wait_on_page_writeback_range() and write_one_page() which something
> like:
>
> if (PageError(page))
> return -EIO;
>
> I'd guess something somewhere acts on the EIO and ignores the fact the
> page is uptodate. I'm struggling a bit to work out the exact codepaths
> though.

I don't think we have to worry about any of the filesystem code, even
if we are talking about a swap file. It should all go straight through
page_io.c. And wait_on_page_writeback doesn't have its return value
checked in any of the swap code AFAIK...

Still, something must be triggering it somewhere.

>>The best policy would probably be to keep the end_page_writeback path as
>>it is, and then detect the PageError in the swap out path somewhere.
>
>
> I wanted to do it this way but couldn't as you end up with processes
> dieing if you don't correct the PageError before the page is accessed.
> Can someone comment on exactly what PageError should/shouldn't meant? Is
> this an ARM specific issue or does it happen on other architectures too?
>
> The possible solutions I can see are:
>
> 1. ARM has the PageError behaviour wrong somewhere and if so, there
> might be different ways to handle this.
> 2. The PageError behaviour is correct so we need to find some other way
> to mark a page as needing marking as bad and do so out of interrupt
> context.

PageError means that the page could not be written / read from disk.
This is true in the pagecache / filesystem code too, and it is
PageUptodate that tells whether the page itself is valid or not.

> 3. Any other ideas?
>
> Can you give me a hint as to where in the swap out path you might want
> to detect the error? The only way I can see is to have the end of
> swap_writepage() wait_on_page_writeback() but that would appear to have
> significant performance implications. I'm struggling to see a way you
> can do this which is always outside interrupt context.

swap_writepage sounds better than . You've even got the page
already locked for you, so the job's half done ;)

What performance implications did you imagine? The fastpath will
just be a single PageError test, and error case slowpath doesn't
matter too much beyond having something that actually works.

--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com

2006-10-30 11:55:59

by Richard Purdie

[permalink] [raw]
Subject: Re: [PATCH, RFC/T] Fix handling of write failures to swap devices

On Sat, 2006-10-28 at 22:10 +1000, Nick Piggin wrote:
> Richard Purdie wrote:
> > I'm open to suggestions as to how to do it. The main problem was that in
> > in order to mark the page bad you needed to hold an extra reference on
> > the page so that the "mark bad" code would be the last code to touch the
> > page. The swapoff code doesn't need this and I don't like the idea of
> > passing some count value to a common function as that would be
> > confusing. I guess swapoff could start taking an extra reference but I
> > can see people objecting to that too as it doesn't need it.
>
> If you have the page locked (which you can't from where you're trying,
> but we'll tackle that next), and the page is swapcache, then doesn't
> that pin you a reference to the swap entry as well? So I still don't
> see why you need that extra reference.... I know there is a
> try_to_unuse_page/entry hiding in try_to_unuse somewhere ;)

The final call to swap_free() releases the swap mapping and beyond that
point you can't tell which swap page you were dealing with.
try_to_unuse_page() calls swap_free() each time it removes a reference.
To make sure we control the last swap_free(), I had it take a reference
with swap_duplicate() and then we guarantee we call the last free. The
last free can then mark the page bad and take care of the accounting.
This does mean try_to_unuse_page() succeeds when the refcount is 2 and
not 1 as in the case of the usual try_to_unuse().

> That isn't your only problem though, and we simply don't want to do
> this (potentially expensive) unusing from interrupt context. Noting
> the error and dealing with it in process context I think is the best
> way to do it.

The reasoning was that this circumstance should be extremely rare. If it
happens, we have a hardware problem. Recovering from that hardware
problem gracefully is more important than a slightly longer interrupt.
But yes, process context would be nicer, *if* we can find a way to do
it.

> BTW. what's the driver you're using? It might be useful to have an
> option to schedule a timer for the completions (at least the ones
> which generate errors).

Its a custom driver. I'm sure I can force it into using interrupt
context for the completions to try and test things although I'm also
fairly sure the existing patch will break when I do that for the reasons
you mention :-(.

> >>You say that SetPageError makes the processes die unexpectedly? How and
> >>where? We use SetPageError for IO errors, and it doesn't mean the page
> >>has errors AFAIK.
> >
> > Most of the testing I did was on ARM. If you mark a page with
> > SetPageError, the next time userspace tries to access it, you get a data
> > abort and the process dies with a SIGBUS or other faults (even if the
> > page is marked as up to date). The effect was very clear and I just
> > assumed it was behaving as intended. It was this problem I started to
> > address as I didn't like processes randomly dieing...
>
> I can't for the life of me see how or why this is happening. I don't
> doubt it is a problem, but it smells like another bug.

I can't work out the code path it happens in and until I do, I'm not
sure how I can track it down...

> > Ignoring the filesystem calls to PageError(), the remaining two are
> > wait_on_page_writeback_range() and write_one_page() which something
> > like:
> >
> > if (PageError(page))
> > return -EIO;
> >
> > I'd guess something somewhere acts on the EIO and ignores the fact the
> > page is uptodate. I'm struggling a bit to work out the exact codepaths
> > though.
>
> I don't think we have to worry about any of the filesystem code, even
> if we are talking about a swap file. It should all go straight through
> page_io.c. And wait_on_page_writeback doesn't have its return value
> checked in any of the swap code AFAIK...
>
> Still, something must be triggering it somewhere.

Something must be but I wish I knew what/where...

> > 3. Any other ideas?
> >
> > Can you give me a hint as to where in the swap out path you might want
> > to detect the error? The only way I can see is to have the end of
> > swap_writepage() wait_on_page_writeback() but that would appear to have
> > significant performance implications. I'm struggling to see a way you
> > can do this which is always outside interrupt context.
>
> swap_writepage sounds better than . You've even got the page
> already locked for you, so the job's half done ;)
>
> What performance implications did you imagine? The fastpath will
> just be a single PageError test, and error case slowpath doesn't
> matter too much beyond having something that actually works.

If swap_writepage() is to check for an error, it will have to wait until
the IO is complete with something like wait_on_page_writeback() before
it can check. The performance implication is the extra
wait_on_page_writeback() on every call to swap_writepage(). In the
meantime, it will have to give up the page lock and then reacquire it.
Unless you're thinking of something different?

Regards,

Richard



2006-11-01 05:27:00

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH, RFC/T] Fix handling of write failures to swap devices

Notice swap write errors during page reclaim, and deallocate the swap entry
which is backing the swapcache. This allows the page error to be cleared and
the page be allocated to a new entry, rather than the page to becoming pinned
forever.

Based on code from Richard Purdie <[email protected]>

Index: linux-2.6/mm/shmem.c
===================================================================
--- linux-2.6.orig/mm/shmem.c 2006-11-01 14:58:28.000000000 +1100
+++ linux-2.6/mm/shmem.c 2006-11-01 16:10:21.000000000 +1100
@@ -729,6 +729,8 @@ static int shmem_unuse_inode(struct shme
struct page *subdir;
swp_entry_t *ptr;
int offset;
+ int moved;
+ int error;

idx = 0;
ptr = info->i_direct;
@@ -786,17 +788,20 @@ lost2:
found:
idx += offset;
inode = &info->vfs_inode;
- if (move_from_swap_cache(page, idx, inode->i_mapping) == 0) {
+ error = PageError(page);
+ moved = (move_from_swap_cache(page, idx, inode->i_mapping) == 0);
+ if (moved) {
info->flags |= SHMEM_PAGEIN;
shmem_swp_set(info, ptr + offset, 0);
}
shmem_swp_unmap(ptr);
spin_unlock(&info->lock);
- /*
- * Decrement swap count even when the entry is left behind:
- * try_to_unuse will skip over mms, then reincrement count.
- */
- swap_free(entry);
+ if (moved) {
+ if (!error)
+ swap_free(entry);
+ else
+ swap_free_markbad(entry);
+ }
return 1;
}

Index: linux-2.6/mm/swapfile.c
===================================================================
--- linux-2.6.orig/mm/swapfile.c 2006-11-01 14:58:28.000000000 +1100
+++ linux-2.6/mm/swapfile.c 2006-11-01 16:14:26.000000000 +1100
@@ -304,6 +304,23 @@ void swap_free(swp_entry_t entry)
}
}

+void swap_free_markbad(swp_entry_t entry)
+{
+ struct swap_info_struct * p;
+
+ p = swap_info_get(entry);
+ if (p) {
+ unsigned long offset = swp_offset(entry);
+ if (swap_entry_free(p, offset) == 0) {
+ p->swap_map[offset] = SWAP_MAP_BAD;
+ p->pages--;
+ nr_swap_pages--;
+ total_swap_pages--;
+ }
+ spin_unlock(&swap_lock);
+ }
+}
+
/*
* How many references to page are currently swapped out?
*/
@@ -618,6 +635,207 @@ static int unuse_mm(struct mm_struct *mm
return 0;
}

+static int try_to_unuse_entry(swp_entry_t entry, unsigned short *swap_map,
+ struct page *page, struct mm_struct **start_mm_p)
+{
+ struct mm_struct *start_mm;
+ unsigned short swcount;
+ int retval = 0;
+ int shmem;
+
+ if (start_mm_p)
+ start_mm = *start_mm_p;
+ else {
+ start_mm = &init_mm;
+ atomic_inc(&init_mm.mm_users);
+ }
+
+ /*
+ * Don't hold on to start_mm if it looks like exiting.
+ */
+ if (atomic_read(&start_mm->mm_users) == 1) {
+ mmput(start_mm);
+ start_mm = &init_mm;
+ atomic_inc(&init_mm.mm_users);
+ }
+
+ /*
+ * Wait for and lock page. When do_swap_page races with try_to_unuse,
+ * do_swap_page can handle the fault much faster than try_to_unuse can
+ * locate the entry. This apparently redundant "wait_on_page_locked"
+ * lets try_to_unuse defer to do_swap_page in such a case - in some
+ * tests, do_swap_page and try_to_unuse repeatedly compete.
+ */
+retry:
+ wait_on_page_locked(page);
+ wait_on_page_writeback(page);
+ lock_page(page);
+ wait_on_page_writeback(page);
+
+ /*
+ * Remove all references to entry.
+ * Whenever we reach init_mm, there's no address space to search, but
+ * use it as a reminder to search shmem.
+ */
+ shmem = 0;
+ swcount = *swap_map;
+ if (swcount > 1) {
+ if (start_mm == &init_mm)
+ shmem = shmem_unuse(entry, page);
+ else
+ retval = unuse_mm(start_mm, entry, page);
+ }
+ if (*swap_map > 1) {
+ int set_start_mm = (*swap_map >= swcount);
+ struct list_head *p = &start_mm->mmlist;
+ struct mm_struct *new_start_mm = start_mm;
+ struct mm_struct *prev_mm = start_mm;
+ struct mm_struct *mm;
+
+ atomic_inc(&new_start_mm->mm_users);
+ atomic_inc(&prev_mm->mm_users);
+ spin_lock(&mmlist_lock);
+ while (*swap_map > 1 && !retval &&
+ (p = p->next) != &start_mm->mmlist) {
+ mm = list_entry(p, struct mm_struct, mmlist);
+ if (!atomic_inc_not_zero(&mm->mm_users))
+ continue;
+ spin_unlock(&mmlist_lock);
+ mmput(prev_mm);
+ prev_mm = mm;
+
+ cond_resched();
+
+ swcount = *swap_map;
+ if (swcount <= 1)
+ ;
+ else if (mm == &init_mm) {
+ set_start_mm = 1;
+ shmem = shmem_unuse(entry, page);
+ } else
+ retval = unuse_mm(mm, entry, page);
+ if (set_start_mm && *swap_map < swcount) {
+ mmput(new_start_mm);
+ atomic_inc(&mm->mm_users);
+ new_start_mm = mm;
+ set_start_mm = 0;
+ }
+ spin_lock(&mmlist_lock);
+ }
+ spin_unlock(&mmlist_lock);
+ mmput(prev_mm);
+ mmput(start_mm);
+ start_mm = new_start_mm;
+ }
+ if (retval) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto out;
+ }
+
+ /*
+ * How could swap count reach 0x7fff when the maximum pid is 0x7fff,
+ * and there's no way to repeat a swap page within an mm (except in
+ * shmem, where it's the shared object which takes the reference
+ * count)? We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
+ *
+ * If that's wrong, then we should worry more about exit_mmap() and
+ * do_munmap() cases described above: we might be resetting
+ * SWAP_MAP_MAX too early here. We know "Undead"s can happen, they're
+ * okay, so don't report them; but do report if we reset SWAP_MAP_MAX.
+ */
+ if (*swap_map == SWAP_MAP_MAX) {
+ spin_lock(&swap_lock);
+ *swap_map = 1;
+ spin_unlock(&swap_lock);
+ if (printk_ratelimit())
+ printk(KERN_WARNING
+ "try_to_unuse_entry: cleared swap entry overflow\n");
+ }
+
+ /*
+ * If a reference remains (rare), we would like to leave the page in
+ * the swap cache; but try_to_unmap could then re-duplicate the entry
+ * once we drop page lock, so we might loop indefinitely; also, that
+ * page could not be swapped out to other storage meanwhile. So:
+ * delete from cache even if there's another reference, after ensuring
+ * that the data has been saved to disk - since if the reference
+ * remains (rarer), it will be read from disk into another page.
+ * Splitting into two pages would be incorrect if swap supported
+ * "shared private" pages, but they are handled by tmpfs files.
+ *
+ * Note shmem_unuse already deleted a swappage from the swap cache,
+ * unless the move to filepage failed: in which case it left swappage
+ * in cache, lowered its swap count to pass quickly through the loops
+ * above, and now we must reincrement count to try again later.
+ */
+ if (PageSwapCache(page)) {
+ if ((*swap_map > 1) && PageDirty(page)) {
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ };
+
+ swap_writepage(page, &wbc);
+ goto retry;
+ }
+ if (!shmem) {
+ int error = PageError(page);
+
+ write_lock_irq(&swapper_space.tree_lock);
+ __delete_from_swap_cache(page);
+ write_unlock_irq(&swapper_space.tree_lock);
+ page_cache_release(page); /* the swapcache ref */
+
+ if (!error)
+ swap_free(entry);
+ else
+ swap_free_markbad(entry);
+ }
+ }
+
+ /*
+ * So we could skip searching mms once swap count went to 1, we did not
+ * mark any present ptes as dirty: must mark page dirty so shrink_list
+ * will preserve it.
+ */
+ SetPageDirty(page);
+ unlock_page(page);
+ page_cache_release(page);
+
+ if (start_mm_p)
+ *start_mm_p = start_mm;
+ else
+ mmput(start_mm);
+
+out:
+ return retval;
+}
+
+void try_to_unuse_page_entry(struct page *page)
+{
+ struct swap_info_struct *si;
+ unsigned short *swap_map;
+ swp_entry_t entry;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(!PageSwapCache(page));
+ BUG_ON(PageWriteback(page));
+ BUG_ON(PagePrivate(page));
+
+ entry.val = page_private(page);
+ si = swap_info_get(entry);
+ if (!si) {
+ WARN_ON(1);
+ return;
+ }
+ swap_map = &si->swap_map[swp_offset(entry)];
+ spin_unlock(&swap_lock);
+
+ BUG_ON(*swap_map == SWAP_MAP_BAD);
+
+ try_to_unuse_entry(entry, swap_map, page, NULL);
+}
+
/*
* Scan swap_map from current position to next entry still in use.
* Recycle to start on reaching the end, returning 0 when empty.
@@ -666,13 +884,10 @@ static int try_to_unuse(unsigned int typ
struct swap_info_struct * si = &swap_info[type];
struct mm_struct *start_mm;
unsigned short *swap_map;
- unsigned short swcount;
struct page *page;
swp_entry_t entry;
unsigned int i = 0;
int retval = 0;
- int reset_overflow = 0;
- int shmem;

/*
* When searching mms for an entry, a good strategy is to
@@ -724,152 +939,10 @@ static int try_to_unuse(unsigned int typ
break;
}

- /*
- * Don't hold on to start_mm if it looks like exiting.
- */
- if (atomic_read(&start_mm->mm_users) == 1) {
- mmput(start_mm);
- start_mm = &init_mm;
- atomic_inc(&init_mm.mm_users);
- }
-
- /*
- * Wait for and lock page. When do_swap_page races with
- * try_to_unuse, do_swap_page can handle the fault much
- * faster than try_to_unuse can locate the entry. This
- * apparently redundant "wait_on_page_locked" lets try_to_unuse
- * defer to do_swap_page in such a case - in some tests,
- * do_swap_page and try_to_unuse repeatedly compete.
- */
- wait_on_page_locked(page);
- wait_on_page_writeback(page);
- lock_page(page);
- wait_on_page_writeback(page);
-
- /*
- * Remove all references to entry.
- * Whenever we reach init_mm, there's no address space
- * to search, but use it as a reminder to search shmem.
- */
- shmem = 0;
- swcount = *swap_map;
- if (swcount > 1) {
- if (start_mm == &init_mm)
- shmem = shmem_unuse(entry, page);
- else
- retval = unuse_mm(start_mm, entry, page);
- }
- if (*swap_map > 1) {
- int set_start_mm = (*swap_map >= swcount);
- struct list_head *p = &start_mm->mmlist;
- struct mm_struct *new_start_mm = start_mm;
- struct mm_struct *prev_mm = start_mm;
- struct mm_struct *mm;
-
- atomic_inc(&new_start_mm->mm_users);
- atomic_inc(&prev_mm->mm_users);
- spin_lock(&mmlist_lock);
- while (*swap_map > 1 && !retval &&
- (p = p->next) != &start_mm->mmlist) {
- mm = list_entry(p, struct mm_struct, mmlist);
- if (!atomic_inc_not_zero(&mm->mm_users))
- continue;
- spin_unlock(&mmlist_lock);
- mmput(prev_mm);
- prev_mm = mm;
-
- cond_resched();
-
- swcount = *swap_map;
- if (swcount <= 1)
- ;
- else if (mm == &init_mm) {
- set_start_mm = 1;
- shmem = shmem_unuse(entry, page);
- } else
- retval = unuse_mm(mm, entry, page);
- if (set_start_mm && *swap_map < swcount) {
- mmput(new_start_mm);
- atomic_inc(&mm->mm_users);
- new_start_mm = mm;
- set_start_mm = 0;
- }
- spin_lock(&mmlist_lock);
- }
- spin_unlock(&mmlist_lock);
- mmput(prev_mm);
- mmput(start_mm);
- start_mm = new_start_mm;
- }
- if (retval) {
- unlock_page(page);
- page_cache_release(page);
+ retval = try_to_unuse_entry(entry, swap_map, page,
+ &start_mm);
+ if (retval)
break;
- }
-
- /*
- * How could swap count reach 0x7fff when the maximum
- * pid is 0x7fff, and there's no way to repeat a swap
- * page within an mm (except in shmem, where it's the
- * shared object which takes the reference count)?
- * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
- *
- * If that's wrong, then we should worry more about
- * exit_mmap() and do_munmap() cases described above:
- * we might be resetting SWAP_MAP_MAX too early here.
- * We know "Undead"s can happen, they're okay, so don't
- * report them; but do report if we reset SWAP_MAP_MAX.
- */
- if (*swap_map == SWAP_MAP_MAX) {
- spin_lock(&swap_lock);
- *swap_map = 1;
- spin_unlock(&swap_lock);
- reset_overflow = 1;
- }
-
- /*
- * If a reference remains (rare), we would like to leave
- * the page in the swap cache; but try_to_unmap could
- * then re-duplicate the entry once we drop page lock,
- * so we might loop indefinitely; also, that page could
- * not be swapped out to other storage meanwhile. So:
- * delete from cache even if there's another reference,
- * after ensuring that the data has been saved to disk -
- * since if the reference remains (rarer), it will be
- * read from disk into another page. Splitting into two
- * pages would be incorrect if swap supported "shared
- * private" pages, but they are handled by tmpfs files.
- *
- * Note shmem_unuse already deleted a swappage from
- * the swap cache, unless the move to filepage failed:
- * in which case it left swappage in cache, lowered its
- * swap count to pass quickly through the loops above,
- * and now we must reincrement count to try again later.
- */
- if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- };
-
- swap_writepage(page, &wbc);
- lock_page(page);
- wait_on_page_writeback(page);
- }
- if (PageSwapCache(page)) {
- if (shmem)
- swap_duplicate(entry);
- else
- delete_from_swap_cache(page);
- }
-
- /*
- * So we could skip searching mms once swap count went
- * to 1, we did not mark any present ptes as dirty: must
- * mark page dirty so shrink_list will preserve it.
- */
- SetPageDirty(page);
- unlock_page(page);
- page_cache_release(page);

/*
* Make sure that we aren't completely killing
@@ -879,10 +952,6 @@ static int try_to_unuse(unsigned int typ
}

mmput(start_mm);
- if (reset_overflow) {
- printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
- swap_overflow = 0;
- }
return retval;
}

Index: linux-2.6/mm/swap_state.c
===================================================================
--- linux-2.6.orig/mm/swap_state.c 2006-11-01 15:11:54.000000000 +1100
+++ linux-2.6/mm/swap_state.c 2006-11-01 15:51:20.000000000 +1100
@@ -131,6 +131,8 @@ void __delete_from_swap_cache(struct pag
radix_tree_delete(&swapper_space.page_tree, page_private(page));
set_page_private(page, 0);
ClearPageSwapCache(page);
+ if (unlikely(PageError(page)))
+ ClearPageError(page);
total_swapcache_pages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
INC_CACHE_INFO(del_total);
Index: linux-2.6/include/linux/swap.h
===================================================================
--- linux-2.6.orig/include/linux/swap.h 2006-11-01 15:54:10.000000000 +1100
+++ linux-2.6/include/linux/swap.h 2006-11-01 16:10:44.000000000 +1100
@@ -246,6 +246,7 @@ extern swp_entry_t get_swap_page_of_type
extern int swap_duplicate(swp_entry_t);
extern int valid_swaphandles(swp_entry_t, unsigned long *);
extern void swap_free(swp_entry_t);
+extern void swap_free_markbad(swp_entry_t);
extern void free_swap_and_cache(swp_entry_t);
extern int swap_type_of(dev_t);
extern unsigned int count_swap_pages(int, int);
@@ -253,6 +254,7 @@ extern sector_t map_swap_page(struct swa
extern struct swap_info_struct *get_swap_info_struct(unsigned);
extern int can_share_swap_page(struct page *);
extern int remove_exclusive_swap_page(struct page *);
+extern void try_to_unuse_page_entry(struct page *page);
struct backing_dev_info;

extern spinlock_t swap_lock;
Index: linux-2.6/mm/page_io.c
===================================================================
--- linux-2.6.orig/mm/page_io.c 2006-11-01 15:52:40.000000000 +1100
+++ linux-2.6/mm/page_io.c 2006-11-01 15:52:46.000000000 +1100
@@ -59,15 +59,12 @@ static int end_swap_bio_write(struct bio
* Re-dirty the page in order to avoid it being reclaimed.
* Also print a dire warning that things will go BAD (tm)
* very quickly.
- *
- * Also clear PG_reclaim to avoid rotate_reclaimable_page()
*/
set_page_dirty(page);
printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
imajor(bio->bi_bdev->bd_inode),
iminor(bio->bi_bdev->bd_inode),
(unsigned long long)bio->bi_sector);
- ClearPageReclaim(page);
}
end_page_writeback(page);
bio_put(bio);
Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c 2006-11-01 15:53:29.000000000 +1100
+++ linux-2.6/mm/vmscan.c 2006-11-01 16:07:41.000000000 +1100
@@ -486,9 +486,21 @@ static unsigned long shrink_page_list(st
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
*/
- if (PageAnon(page) && !PageSwapCache(page))
- if (!add_to_swap(page, GFP_ATOMIC))
- goto activate_locked;
+ if (PageAnon(page)) {
+ /*
+ * Encountered an error last time? Try to remove the
+ * page from its current position, which will notice
+ * the error and mark that swap entry bad. Then we can
+ * try allocating another swap entry.
+ */
+ if (PageSwapCache(page) && unlikely(PageError(page)))
+ try_to_unuse_page_entry(page);
+
+ if (!PageSwapCache(page)) {
+ if (!add_to_swap(page, GFP_ATOMIC))
+ goto activate_locked;
+ }
+ }
#endif /* CONFIG_SWAP */

mapping = page_mapping(page);


Attachments:
mm-swap-fail.patch (16.39 kB)

2006-11-01 05:37:39

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH, RFC/T] Fix handling of write failures to swap devices

Richard Purdie wrote:
> On Sat, 2006-10-28 at 22:10 +1000, Nick Piggin wrote:

>>That isn't your only problem though, and we simply don't want to do
>>this (potentially expensive) unusing from interrupt context. Noting
>>the error and dealing with it in process context I think is the best
>>way to do it.
>
>
> The reasoning was that this circumstance should be extremely rare. If it
> happens, we have a hardware problem. Recovering from that hardware
> problem gracefully is more important than a slightly longer interrupt.
> But yes, process context would be nicer, *if* we can find a way to do
> it.

Also note that the current code *should* "gracefully" handle the failure.
In that, it will not reclaim the page on a write error, so it isn't going
to cause a data loss...

It's just that it currently results in unswappable pages.

Handling it more gracefully by allowing the page to be retried with another
swap entry is OK I guess, but given the added complexity, I'm not even sure
it is worthwhile.

Perhaps we should just do the ClearPageError in the try_to_unuse path,
because the sysadmin should take down that swap device on failure. So if a
new device is added, we want to be able to unpin the failed pages.

--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com

2006-11-01 09:24:20

by Richard Purdie

[permalink] [raw]
Subject: Re: [PATCH, RFC/T] Fix handling of write failures to swap devices

On Wed, 2006-11-01 at 16:26 +1100, Nick Piggin wrote:
> > I can't work out the code path it happens in and until I do, I'm not
> > sure how I can track it down...
>
> Is your driver scribbling on the page memory when it encounters a write
> error, or is the SIGBUS coming from a subsequent pagefault attempt on
> that address? Stick a WARN_ON(1) in the VM_FAULT_SIGBUS case in
> arch/arm/mm/fault.c to check.

I'm 100% certain the driver doesn't touch the memory page. It would be a
serious problem if it did as I'd see see memory corruption in the cases
where the page wasn't read from disk but read from the swap cache (I
don't).

The error therefore has to be coming from a pagefault attempt on the
address...

> >>Still, something must be triggering it somewhere.
> >
> >
> > Something must be but I wish I knew what/where...
>
> Let's try to find out :)

I'll see if I can work it out...

> Yes, I mean the other side of the writepage, ie. when page reclaim is
> about to attempt to swap it out.
>
> The attached (very untested, in need of splitting up) patch attempts to
> solve these problems. Note that it is probably not going to prevent your
> SIGBUS, so that will have to be found and fixed individually.
>
> In the meantime, I'll run this through some testing when I get half a
> chance.

Right, this is a nicer way to do it. I would have preferred to do
something like this in the first place but didn't as I didn't think I
could use PageError as a marker...

I'll try and work out why PageError is causing the problems and also
give the patch some testing.

Thanks,

Richard

2006-11-01 09:32:55

by Richard Purdie

[permalink] [raw]
Subject: Re: [PATCH, RFC/T] Fix handling of write failures to swap devices

On Wed, 2006-11-01 at 16:36 +1100, Nick Piggin wrote:
> Also note that the current code *should* "gracefully" handle the failure.
> In that, it will not reclaim the page on a write error, so it isn't going
> to cause a data loss...
>
> It's just that it currently results in unswappable pages.
>
> Handling it more gracefully by allowing the page to be retried with another
> swap entry is OK I guess, but given the added complexity, I'm not even sure
> it is worthwhile.
>
> Perhaps we should just do the ClearPageError in the try_to_unuse path,
> because the sysadmin should take down that swap device on failure. So if a
> new device is added, we want to be able to unpin the failed pages.

As I see it, ClearPageError in the try_to_unuse path is the correct
thing to do as once we've marked the swap page as bad, it can't retry to
the same swap page. There is nothing special about the failed pages
beyond that point so we don't need them marked.

Also, if we remove the error flag, the page is still probably on the
inactive list so other existing code is likely to take care of trying a
new write to a another swap entry? I didn't add code for this case for
that reason.

Regards,

Richard

2006-11-02 23:26:30

by Richard Purdie

[permalink] [raw]
Subject: Re: [PATCH, RFC/T] Fix handling of write failures to swap devices

On Wed, 2006-11-01 at 16:26 +1100, Nick Piggin wrote:
> The attached (very untested, in need of splitting up) patch attempts to
> solve these problems. Note that it is probably not going to prevent your
> SIGBUS, so that will have to be found and fixed individually.
>
> In the meantime, I'll run this through some testing when I get half a
> chance.
>
> plain text document attachment (mm-swap-fail.patch)
> Notice swap write errors during page reclaim, and deallocate the swap entry
> which is backing the swapcache. This allows the page error to be cleared and
> the page be allocated to a new entry, rather than the page to becoming pinned
> forever.
>
> Based on code from Richard Purdie <[email protected]>

For reference, I've done some testing with this patch applied and as
soon as I see write errors, processes get jammed in the D state
ultimately resulting in a system lock :-(. I'll see if I can track the
problem down.

I can't seem to reproduce the page error causing bus faults under
current kernels which is strange as it definitely used to happen. I'm
suspecting some kind of broken testing environment was causing it...

Cheers,

Richard

2006-12-13 12:13:17

by Richard Purdie

[permalink] [raw]
Subject: Re: [PATCH, RFC/T] Fix handling of write failures to swap devices

On Wed, 2006-11-01 at 16:26 +1100, Nick Piggin wrote:
> The attached (very untested, in need of splitting up) patch attempts to
> solve these problems. Note that it is probably not going to prevent your
> SIGBUS, so that will have to be found and fixed individually.
>
> In the meantime, I'll run this through some testing when I get half a
> chance.
>
> plain text document attachment (mm-swap-fail.patch)
> Notice swap write errors during page reclaim, and deallocate the swap entry
> which is backing the swapcache. This allows the page error to be cleared and
> the page be allocated to a new entry, rather than the page to becoming pinned
> forever.

I finally got around to doing some debugging on this patch
(http://lkml.org/lkml/2006/11/1/7). Firstly, I can't seem to reproduce
the SIGBUS issues I was seeing so I'm assuming that was some bug I
introduced somewhere along the line and/or something fixed in mainline.
The patch below is a diff on top of the patch linked to above so you can
see what I needed to change to make it work.

The patch you proposed locked upon entry to try_to_unuse_entry() as
shrink_page_list() already had the page locked. I therefore moved the
locking to try_to_unuse() but this is complicated by the retry loop. The
solution below handles this but is slightly ugly as it has the "magic"
locking sequence in two places and I'm not sure where the associated
comment should go. An alternative might be to return -EAGAIN and have
the caller perform the loop for us but I'm not keen on having to
replicate this to all call sites. We could also just check if the page
was locked by the caller? Whats the best way to handle this?

The refcounting was also broken which I solved by moving
page_cache_release() to try_to_unuse() with the locking. Alternatively,
should try_to_unuse_page_entry() increase some refcount?

I added a sanity check to make sure that if we unused a page and it was
no longer in swapcache, its dirty bit was cleared. I suspect this isn't
actually needed but should be harmless.

I also moved the pages with error check outside PageAnon(page) as
otherwise you can see IO loops where it continually tries to swap out a
page with errors.

With these changes, the patch has coped fine with the testing I've
exposed it to where entire swap devices sudden stopped accepting writes.
It happily marked every page in the swap partition bad with no
instabilities.

If we can agree a way to handle the page locking, I think this patch
might be ready for -mm? I'm happy to split it into a small series and
submit it...

Richard


Index: linux-2.6.19/mm/swapfile.c
===================================================================
--- linux-2.6.19.orig/mm/swapfile.c 2006-12-13 11:23:34.000000000 +0000
+++ linux-2.6.19/mm/swapfile.c 2006-12-13 11:21:11.000000000 +0000
@@ -669,9 +669,6 @@ static int try_to_unuse_entry(swp_entry_
* tests, do_swap_page and try_to_unuse repeatedly compete.
*/
retry:
- wait_on_page_locked(page);
- wait_on_page_writeback(page);
- lock_page(page);
wait_on_page_writeback(page);

/*
@@ -729,11 +726,8 @@ retry:
mmput(start_mm);
start_mm = new_start_mm;
}
- if (retval) {
- unlock_page(page);
- page_cache_release(page);
- goto out;
- }
+ if (retval)
+ return retval;

/*
* How could swap count reach 0x7fff when the maximum pid is 0x7fff,
@@ -778,6 +772,9 @@ retry:
};

swap_writepage(page, &wbc);
+ wait_on_page_locked(page);
+ wait_on_page_writeback(page);
+ lock_page(page);
goto retry;
}
if (!shmem) {
@@ -801,15 +798,11 @@ retry:
* will preserve it.
*/
SetPageDirty(page);
- unlock_page(page);
- page_cache_release(page);
-
if (start_mm_p)
*start_mm_p = start_mm;
else
mmput(start_mm);

-out:
return retval;
}

@@ -836,6 +829,8 @@ void try_to_unuse_page_entry(struct page
BUG_ON(*swap_map == SWAP_MAP_BAD);

try_to_unuse_entry(entry, swap_map, page, NULL);
+ if (!PageSwapCache(page))
+ ClearPageDirty(page);
}

/*
@@ -941,8 +936,16 @@ static int try_to_unuse(unsigned int typ
break;
}

+ wait_on_page_locked(page);
+ wait_on_page_writeback(page);
+ lock_page(page);
+
retval = try_to_unuse_entry(entry, swap_map, page,
&start_mm);
+
+ page_cache_release(page);
+ unlock_page(page);
+
if (retval)
break;

Index: linux-2.6.19/mm/vmscan.c
===================================================================
--- linux-2.6.19.orig/mm/vmscan.c 2006-12-13 11:23:34.000000000 +0000
+++ linux-2.6.19/mm/vmscan.c 2006-12-13 11:18:28.000000000 +0000
@@ -489,24 +489,22 @@ static unsigned long shrink_page_list(st

#ifdef CONFIG_SWAP
/*
+ * Encountered an error last time? Try to remove the
+ * page from its current position, which will notice
+ * the error and mark that swap entry bad. Then we can
+ * try allocating another swap entry.
+ */
+ if (PageSwapCache(page) && unlikely(PageError(page)))
+ try_to_unuse_page_entry(page);
+
+ /*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
*/
- if (PageAnon(page)) {
- /*
- * Encountered an error last time? Try to remove the
- * page from its current position, which will notice
- * the error and mark that swap entry bad. Then we can
- * try allocating another swap entry.
- */
- if (PageSwapCache(page) && unlikely(PageError(page)))
- try_to_unuse_page_entry(page);
+ if (PageAnon(page) && !PageSwapCache(page)
+ && !add_to_swap(page, GFP_ATOMIC))
+ goto activate_locked;

- if (!PageSwapCache(page)) {
- if (!add_to_swap(page, GFP_ATOMIC))
- goto activate_locked;
- }
- }
#endif /* CONFIG_SWAP */

mapping = page_mapping(page);