2002-03-22 18:31:33

by Paul Clements

[permalink] [raw]
Subject: [PATCH] 2.4.18 raid1 - fix SMP locking/interrupt errors, fix resync counter errors

diff -urN linux-2.4.18.PRISTINE/drivers/md/raid1.c linux-2.4.18/drivers/md/raid1.c
--- linux-2.4.18.PRISTINE/drivers/md/raid1.c Fri Mar 22 12:53:32 2002
+++ linux-2.4.18/drivers/md/raid1.c Fri Mar 22 13:05:05 2002
@@ -11,6 +11,8 @@
*
* Fixes to reconstruction by Jakob ?stergaard" <[email protected]>
* Various fixes by Neil Brown <[email protected]>
+ * Various fixes by Paul Clements <[email protected]> and
+ * James Bottomley <[email protected]>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -64,7 +66,7 @@

while(cnt) {
struct buffer_head *t;
- md_spin_lock_irq(&conf->device_lock);
+ md_spin_lock_irq(&conf->memory_lock);
if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
while (cnt) {
t = conf->freebh;
@@ -75,7 +77,7 @@
conf->freebh_cnt--;
cnt--;
}
- md_spin_unlock_irq(&conf->device_lock);
+ md_spin_unlock_irq(&conf->memory_lock);
if (cnt == 0)
break;
t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
@@ -98,7 +100,7 @@
static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
{
unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
+ spin_lock_irqsave(&conf->memory_lock, flags);
while (bh) {
struct buffer_head *t = bh;
bh=bh->b_next;
@@ -110,7 +112,7 @@
conf->freebh_cnt++;
}
}
- spin_unlock_irqrestore(&conf->device_lock, flags);
+ spin_unlock_irqrestore(&conf->memory_lock, flags);
wake_up(&conf->wait_buffer);
}

@@ -124,12 +126,12 @@
bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
if (!bh) break;

- md_spin_lock_irq(&conf->device_lock);
+ md_spin_lock_irq(&conf->memory_lock);
bh->b_pprev = &conf->freebh;
bh->b_next = conf->freebh;
conf->freebh = bh;
conf->freebh_cnt++;
- md_spin_unlock_irq(&conf->device_lock);
+ md_spin_unlock_irq(&conf->memory_lock);

i++;
}
@@ -140,14 +142,14 @@
{
/* discard all buffer_heads */

- md_spin_lock_irq(&conf->device_lock);
+ md_spin_lock_irq(&conf->memory_lock);
while (conf->freebh) {
struct buffer_head *bh = conf->freebh;
conf->freebh = bh->b_next;
kmem_cache_free(bh_cachep, bh);
conf->freebh_cnt--;
}
- md_spin_unlock_irq(&conf->device_lock);
+ md_spin_unlock_irq(&conf->memory_lock);
}


@@ -156,7 +158,7 @@
struct raid1_bh *r1_bh = NULL;

do {
- md_spin_lock_irq(&conf->device_lock);
+ md_spin_lock_irq(&conf->memory_lock);
if (!conf->freer1_blocked && conf->freer1) {
r1_bh = conf->freer1;
conf->freer1 = r1_bh->next_r1;
@@ -165,7 +167,7 @@
r1_bh->state = (1 << R1BH_PreAlloc);
r1_bh->bh_req.b_state = 0;
}
- md_spin_unlock_irq(&conf->device_lock);
+ md_spin_unlock_irq(&conf->memory_lock);
if (r1_bh)
return r1_bh;
r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
@@ -191,11 +193,11 @@

if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
+ spin_lock_irqsave(&conf->memory_lock, flags);
r1_bh->next_r1 = conf->freer1;
conf->freer1 = r1_bh;
conf->freer1_cnt++;
- spin_unlock_irqrestore(&conf->device_lock, flags);
+ spin_unlock_irqrestore(&conf->memory_lock, flags);
/* don't need to wakeup wait_buffer because
* raid1_free_bh below will do that
*/
@@ -226,14 +228,14 @@

static void raid1_shrink_r1bh(raid1_conf_t *conf)
{
- md_spin_lock_irq(&conf->device_lock);
+ md_spin_lock_irq(&conf->memory_lock);
while (conf->freer1) {
struct raid1_bh *r1_bh = conf->freer1;
conf->freer1 = r1_bh->next_r1;
conf->freer1_cnt--;
kfree(r1_bh);
}
- md_spin_unlock_irq(&conf->device_lock);
+ md_spin_unlock_irq(&conf->memory_lock);
}


@@ -245,10 +247,10 @@
raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
r1_bh->mirror_bh_list = NULL;

- spin_lock_irqsave(&conf->device_lock, flags);
+ spin_lock_irqsave(&conf->memory_lock, flags);
r1_bh->next_r1 = conf->freebuf;
conf->freebuf = r1_bh;
- spin_unlock_irqrestore(&conf->device_lock, flags);
+ spin_unlock_irqrestore(&conf->memory_lock, flags);
raid1_free_bh(conf, bh);
}

@@ -256,12 +258,12 @@
{
struct raid1_bh *r1_bh;

- md_spin_lock_irq(&conf->device_lock);
- wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
+ md_spin_lock_irq(&conf->memory_lock);
+ wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->memory_lock);
r1_bh = conf->freebuf;
conf->freebuf = r1_bh->next_r1;
r1_bh->next_r1= NULL;
- md_spin_unlock_irq(&conf->device_lock);
+ md_spin_unlock_irq(&conf->memory_lock);

return r1_bh;
}
@@ -269,17 +271,18 @@
static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
{
int i = 0;
+ unsigned long flags;

- md_spin_lock_irq(&conf->device_lock);
+ md_spin_lock_irqsave(&conf->memory_lock, flags);
while (i < cnt) {
struct raid1_bh *r1_bh;
struct page *page;

- page = alloc_page(GFP_KERNEL);
+ page = alloc_page(GFP_ATOMIC);
if (!page)
break;

- r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
+ r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_ATOMIC);
if (!r1_bh) {
__free_page(page);
break;
@@ -291,20 +294,21 @@
conf->freebuf = r1_bh;
i++;
}
- md_spin_unlock_irq(&conf->device_lock);
+ md_spin_unlock_irqrestore(&conf->memory_lock, flags);
return i;
}

static void raid1_shrink_buffers (raid1_conf_t *conf)
{
- md_spin_lock_irq(&conf->device_lock);
+ unsigned long flags;
+ md_spin_lock_irqsave(&conf->memory_lock, flags);
while (conf->freebuf) {
struct raid1_bh *r1_bh = conf->freebuf;
conf->freebuf = r1_bh->next_r1;
__free_page(r1_bh->bh_req.b_page);
kfree(r1_bh);
}
- md_spin_unlock_irq(&conf->device_lock);
+ md_spin_unlock_irqrestore(&conf->memory_lock, flags);
}

static int raid1_map (mddev_t *mddev, kdev_t *rdev)
@@ -723,7 +727,7 @@

#define DISK_FAILED KERN_ALERT \
"raid1: Disk failure on %s, disabling device. \n" \
-" Operation continuing on %d devices\n"
+" Operation continuing on %d devices\n"

#define START_SYNCING KERN_ALERT \
"raid1: start syncing spare disk.\n"
@@ -815,26 +819,33 @@
static void close_sync(raid1_conf_t *conf)
{
mddev_t *mddev = conf->mddev;
+ unsigned long flags;
/* If reconstruction was interrupted, we need to close the "active" and "pending"
* holes.
* we know that there are no active rebuild requests, os cnt_active == cnt_ready ==0
*/
/* this is really needed when recovery stops too... */
- spin_lock_irq(&conf->segment_lock);
+ spin_lock_irqsave(&conf->segment_lock, flags);
conf->start_active = conf->start_pending;
conf->start_ready = conf->start_pending;
wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
- conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
- conf->start_future = mddev->sb->size+1;
- conf->cnt_pending = conf->cnt_future;
- conf->cnt_future = 0;
- conf->phase = conf->phase ^1;
- wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
+ /* have to be careful here - we want phase to go to 0 (although,
+ * this isn't really necessary since it's just a toggle) - but we
+ * cannot be sure of the current phase value */
+ if (conf->phase) {
+ /* close the future window and wait for pending I/O to finish */
+ conf->start_active = conf->start_ready = conf->start_pending = conf->start_future;
+ conf->start_future = mddev->sb->size+1;
+ conf->cnt_pending = conf->cnt_future;
+ conf->cnt_future = 0;
+ conf->phase = conf->phase ^1;
+ wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
+ }
+ /* open the future window across the whole device, closing the others */
conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
- conf->phase = 0;
- conf->cnt_future = conf->cnt_done;;
+ conf->cnt_future += conf->cnt_done;
conf->cnt_done = 0;
- spin_unlock_irq(&conf->segment_lock);
+ spin_unlock_irqrestore(&conf->segment_lock, flags);
wake_up(&conf->wait_done);
}

@@ -1347,6 +1358,7 @@
conf->window = buffs*(PAGE_SIZE>>9)/2;
conf->cnt_future += conf->cnt_done+conf->cnt_pending;
conf->cnt_done = conf->cnt_pending = 0;
+ wake_up(&conf->wait_ready);
if (conf->cnt_ready || conf->cnt_active)
MD_BUG();
}
@@ -1614,7 +1626,7 @@
conf->nr_disks = sb->nr_disks;
conf->mddev = mddev;
conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
-
+ conf->memory_lock = MD_SPIN_LOCK_UNLOCKED;
conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
init_waitqueue_head(&conf->wait_buffer);
init_waitqueue_head(&conf->wait_done);
diff -urN linux-2.4.18.PRISTINE/include/linux/raid/raid1.h linux-2.4.18/include/linux/raid/raid1.h
--- linux-2.4.18.PRISTINE/include/linux/raid/raid1.h Sun Aug 12 15:39:02 2001
+++ linux-2.4.18/include/linux/raid/raid1.h Fri Mar 22 13:03:53 2002
@@ -33,6 +33,9 @@
int resync_mirrors;
struct mirror_info *spare;
md_spinlock_t device_lock;
+ md_spinlock_t memory_lock;/* use this to protect
+ * memory (de)allocations instead
+ * of using the device_lock */

/* buffer pool */
/* buffer_heads that we have pre-allocated have b_pprev -> &freebh


Attachments:
raid1_2_4_18_official_patch.diff (9.00 kB)

2002-03-24 22:40:29

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH] 2.4.18 raid1 - fix SMP locking/interrupt errors, fix resync counter errors

On Friday March 22, [email protected] wrote:
>
> The following patch addresses several critical problems in the raid1
> driver. Please apply.

I would have thought that the "please apply" request should wait until
*after* you have asked for an received comments...

> We've tested these patches pretty thoroughly
> against the 2.4.9-21 Red Hat kernel.

"We've tested" is a long way different to "these patches have been
around for a while, and several people have used them without
problems, and there has been independant review" which I would have
thought was the benchmark for the "stable" kernel.

> The source for raid1 is nearly
> identical from 2.4.9-21 to 2.4.18. The patch is against 2.4.18. If you
> have any questions, concerns, or comments, please send them to me.
>
......
>
> The problems are, briefly:
>
> 1) overuse of device_lock spin lock
>
> The device_lock was being used for two separate, unrelated purposes.
> This was causing too much contention and caused a deadlock in one case.
> The device_lock will be split into two separate locks by introducing a
> new spin lock, "memory_lock".

I can believe that there could be extra contention because of the dual
use of this spin lock. Do you have lockmeter numbers at all?

However I cannot see how it would cause a deadlock. Could you please
give details?


>
> 2) non-atomic memory allocation
>
> Due to use of GFP_KERNEL rather than GFP_ATOMIC, certain threads of the
> raid1 driver were scheduled out while holding a spin lock, causing a
> deadlock to occur. Memory allocation during critical sections where a
> spin lock is held will be changed to atomic allocations.

You are definately right that we should not be calling kmalloc with a
spinlock held - my bad.
However I don't think your fix is ideal. The relevant code is
"raid1_grow_buffers" which allocates a bunch of buffers and attaches
them to the device structure.
The lock is only realy needed for the attachment. A better fix would
be to build a separate list, and then just claim the lock while
attaching that list to the structure.

>
> 3) incorrect enabling/disabling of interrupts during locking
>
> In several cases, the wrong spin_lock* macros were being used. There
> were a few cases where the irqsave/irqrestore versions of the macros
> were needed, but were not used. The symptom of these problems was that
> interrupts were enabled or disabled at inappropriate times, resulting in
> deadlocks.

I don't believe that this is true.
The save/restore versions are only needed if the code might be called
from interrupt context. However the routines where you made this
change: raid1_grow_buffers, raid1_shrink_buffers, close_sync,
are only ever called from process context, with interrupts enabled.
Or am I missing something?


>
> 4) incorrect setting of conf->cnt_future and conf->phase resync counters
>
> The symptoms of this problem were that, if I/O was occurring when a
> resync ended (or was aborted), the resync would hang and never complete.
> This eventually would cause all I/O to the md device to hang.

I'll have to look at this one a bit more closely. I'll let you know
what I think of it.

NeilBrown

2002-03-24 23:26:10

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] 2.4.18 raid1 - fix SMP locking/interrupt errors, fix resync counter errors

Neil Brown wrote:
>
> ...
> The save/restore versions are only needed if the code might be called
> from interrupt context.

Or if the caller may wish to keep interrupts disabled.

> However the routines where you made this
> change: raid1_grow_buffers, raid1_shrink_buffers, close_sync,
> are only ever called from process context, with interrupts enabled.
> Or am I missing something?

If those functions are always called with interrupts enabled then
no, you're not missing anything ;)

However a bare spin_unlock_irq() in a function means that
callers which wish to keep interrupts disabled are subtly
subverted. We've had bugs from this before.

So the irqrestore functions are much more robust. I believe
that they should be the default choice. The non-restore
versions should be viewed as a micro-optimised version,
to be used with caution. The additional expense of the save/restore
is quite tiny - 20-30 cycles, perhaps.

-

2002-03-25 18:35:28

by Paul Clements

[permalink] [raw]
Subject: Re: [PATCH] 2.4.18 raid1 - fix SMP locking/interrupt errors, fix resync counter errors

On Sun, 24 Mar 2002, Andrew Morton wrote:

> However a bare spin_unlock_irq() in a function means that
> callers which wish to keep interrupts disabled are subtly
> subverted. We've had bugs from this before.

Yes, that was precisely what was happening in raid1. There were
"nested" spin_lock_irq() calls.

> So the irqrestore functions are much more robust. I believe
> that they should be the default choice. The non-restore
> versions should be viewed as a micro-optimised version,
> to be used with caution. The additional expense of the save/restore
> is quite tiny - 20-30 cycles, perhaps.

I was wondering about the performance of these. I was reluctant
to change all occurrences of spin_lock_irq() to the save/restore
versions, even though that seemed like the safest thing to do, so
I had to analyze every code path where spin_locks were involved
to see which ones absolutely needed to change...very tedious.

Thanks for the explanations.

--
Paul

2002-03-25 18:54:17

by Paul Clements

[permalink] [raw]
Subject: Re: [PATCH] 2.4.18 raid1 - fix SMP locking/interrupt errors, fix resync counter errors

Neil,

Thanks for your feedback. Replies below...

--
Paul Clements
SteelEye Technology
[email protected]


On Mon, 25 Mar 2002, Neil Brown wrote:

> On Friday March 22, [email protected] wrote:
> >
> > The problems are, briefly:
> >
> > 1) overuse of device_lock spin lock
> >
> > The device_lock was being used for two separate, unrelated purposes.
> > This was causing too much contention and caused a deadlock in one case.
> > The device_lock will be split into two separate locks by introducing a
> > new spin lock, "memory_lock".
>
> I can believe that there could be extra contention because of the dual
> use of this spin lock. Do you have lockmeter numbers at all?

No, I'm not familiar with that. How do I get those? Is it fairly simple?

I wasn't so much concerned about extra contention as the (in my mind)
logical separation of these two different tasks, and the fact that
the lack of separation had led to a deadlock.


> However I cannot see how it would cause a deadlock. Could you please
> give details?

raid1_diskop() calls close_sync() -- close_sync() schedules itself out
to wait for pending I/O to quiesce so that the resync can end...
meanwhile #CPUs (in my case, 2) tasks enter into any of the memory
(de)allocation routines and spin on the device_lock forever...

> >
> > 2) non-atomic memory allocation
> >
> > Due to use of GFP_KERNEL rather than GFP_ATOMIC, certain threads of the
> > raid1 driver were scheduled out while holding a spin lock, causing a
> > deadlock to occur. Memory allocation during critical sections where a
> > spin lock is held will be changed to atomic allocations.
>
> You are definately right that we should not be calling kmalloc with a
> spinlock held - my bad.
> However I don't think your fix is ideal. The relevant code is
> "raid1_grow_buffers" which allocates a bunch of buffers and attaches
> them to the device structure.
> The lock is only realy needed for the attachment. A better fix would
> be to build a separate list, and then just claim the lock while
> attaching that list to the structure.

Unfortunately, this won't work, because the segment_lock is also held
while this code is executing (see raid1_sync_request).


> >
> > 3) incorrect enabling/disabling of interrupts during locking
> >
> > In several cases, the wrong spin_lock* macros were being used. There
> > were a few cases where the irqsave/irqrestore versions of the macros
> > were needed, but were not used. The symptom of these problems was that
> > interrupts were enabled or disabled at inappropriate times, resulting in
> > deadlocks.
>
> I don't believe that this is true.
> The save/restore versions are only needed if the code might be called
> from interrupt context. However the routines where you made this
> change: raid1_grow_buffers, raid1_shrink_buffers, close_sync,
> are only ever called from process context, with interrupts enabled.
> Or am I missing something?

please see my other e-mail reply to Andrew Morton regarding this...


> >
> > 4) incorrect setting of conf->cnt_future and conf->phase resync counters
> >
> > The symptoms of this problem were that, if I/O was occurring when a
> > resync ended (or was aborted), the resync would hang and never complete.
> > This eventually would cause all I/O to the md device to hang.
>
> I'll have to look at this one a bit more closely. I'll let you know
> what I think of it.

OK. If you come up with something better, please let me know.

2002-03-26 01:50:51

by NeilBrown

[permalink] [raw]
Subject: Re: [PATCH] 2.4.18 raid1 - fix SMP locking/interrupt errors, fix resync counter errors

On Monday March 25, [email protected] wrote:
> Neil,
>
> Thanks for your feedback. Replies below...
>

ditto :-)

> >
> > I can believe that there could be extra contention because of the dual
> > use of this spin lock. Do you have lockmeter numbers at all?
>
> No, I'm not familiar with that. How do I get those? Is it fairly
> simple?

I've never tried myself, so I don't know of simple it is. The relevant
home page seems to be:
http://oss.sgi.com/projects/lockmeter/

>
> I wasn't so much concerned about extra contention as the (in my mind)
> logical separation of these two different tasks, and the fact that
> the lack of separation had led to a deadlock.
>

Certainly these are logically distinct uses of the same lock, though
there are still closely related.
There is often a tension between splitting a lock to reduce
granularity and keeping the number of spinlocks to a minimum.
In general I would only split a lock if their was either a clear
semantic need, or measureable performance impact.

If a deadlock were a consequence of not splitting, that would be a
strong argument, but I think we can avoid the deadlock by other means.

>
> > However I cannot see how it would cause a deadlock. Could you please
> > give details?
>
> raid1_diskop() calls close_sync() -- close_sync() schedules itself out
> to wait for pending I/O to quiesce so that the resync can end...
> meanwhile #CPUs (in my case, 2) tasks enter into any of the memory
> (de)allocation routines and spin on the device_lock forever...

Ahhhh.... Thanks....
close_sync() definately shouldn't be called with a spinlock held. In
the patch below I have moved it out of the locked region.

> > You are definately right that we should not be calling kmalloc with a
> > spinlock held - my bad.
> > However I don't think your fix is ideal. The relevant code is
> > "raid1_grow_buffers" which allocates a bunch of buffers and attaches
> > them to the device structure.
> > The lock is only realy needed for the attachment. A better fix would
> > be to build a separate list, and then just claim the lock while
> > attaching that list to the structure.
>
> Unfortunately, this won't work, because the segment_lock is also held
> while this code is executing (see raid1_sync_request).
>

Good point, thanks. I think this means that the call to
raid1_grow_buffers needs to be moved out from inside the locked
region. This is done in the following patch.

>
> > >
> > > 3) incorrect enabling/disabling of interrupts during locking
> > >
...
> >
> > I don't believe that this is true.
> > The save/restore versions are only needed if the code might be called
> > from interrupt context. However the routines where you made this
> > change: raid1_grow_buffers, raid1_shrink_buffers, close_sync,
> > are only ever called from process context, with interrupts enabled.
> > Or am I missing something?
>
> please see my other e-mail reply to Andrew Morton regarding this...

OK, I understand now. spin_lock_irq is being called while
spin_lock_irq is already in-force. I think this is best fixed by
moving calls outside of locked regions as mentioned above.


>
>
> > >
> > > 4) incorrect setting of conf->cnt_future and conf->phase resync counters
> > >
> > > The symptoms of this problem were that, if I/O was occurring when a
> > > resync ended (or was aborted), the resync would hang and never complete.
> > > This eventually would cause all I/O to the md device to hang.
> >
> > I'll have to look at this one a bit more closely. I'll let you know
> > what I think of it.
>
> OK. If you come up with something better, please let me know.

OK, I've had a look, and I see what the problem is:

conf->start_future = mddev->sb->size+1;

start_future is in sectors. sb->size is in Kibibytes :-(
Should be
conf->start_future = (mddev->sb->size<<1)+1;

This error would explain your symptom.


Below is a patch which I believe should address all of your symptoms
and the bugs that they expose. If your are able to test it and let me
know how it works for you I would appreciate it.

Thanks
NeilBrown



----------- Diffstat output ------------
./drivers/md/raid1.c | 54 +++++++++++++++++++++++++++++++++++++--------------
1 files changed, 40 insertions(+), 14 deletions(-)

--- ./drivers/md/raid1.c 2002/03/25 21:53:59 1.1
+++ ./drivers/md/raid1.c 2002/03/26 01:47:56 1.2
@@ -269,8 +269,9 @@
static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
{
int i = 0;
+ struct raid1_bh *head = NULL, **tail;
+ tail = &head;

- md_spin_lock_irq(&conf->device_lock);
while (i < cnt) {
struct raid1_bh *r1_bh;
struct page *page;
@@ -287,10 +288,18 @@
memset(r1_bh, 0, sizeof(*r1_bh));
r1_bh->bh_req.b_page = page;
r1_bh->bh_req.b_data = page_address(page);
- r1_bh->next_r1 = conf->freebuf;
- conf->freebuf = r1_bh;
+ *tail = r1_bh;
+ r1_bh->next_r1 = NULL;
+ tail = & r1_bh->next_r1;
i++;
}
+ /* this lock probably isn't needed, as at the time when
+ * we are allocating buffers, nobody else will be touching the
+ * freebuf list. But it doesn't hurt....
+ */
+ md_spin_lock_irq(&conf->device_lock);
+ *tail = conf->freebuf;
+ conf->freebuf = head;
md_spin_unlock_irq(&conf->device_lock);
return i;
}
@@ -825,7 +834,7 @@
conf->start_ready = conf->start_pending;
wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
- conf->start_future = mddev->sb->size+1;
+ conf->start_future = (mddev->sb->size<<1)+1;
conf->cnt_pending = conf->cnt_future;
conf->cnt_future = 0;
conf->phase = conf->phase ^1;
@@ -849,6 +858,14 @@
mdk_rdev_t *spare_rdev, *failed_rdev;

print_raid1_conf(conf);
+
+ switch (state) {
+ case DISKOP_SPARE_ACTIVE:
+ case DISKOP_SPARE_INACTIVE:
+ /* need to wait for pending sync io before locking device */
+ close_sync(conf);
+ }
+
md_spin_lock_irq(&conf->device_lock);
/*
* find the disk ...
@@ -951,7 +968,11 @@
* Deactivate a spare disk:
*/
case DISKOP_SPARE_INACTIVE:
- close_sync(conf);
+ if (conf->start_future > 0) {
+ MD_BUG();
+ err = -EBUSY;
+ break;
+ }
sdisk = conf->mirrors + spare_disk;
sdisk->operational = 0;
sdisk->write_only = 0;
@@ -964,7 +985,11 @@
* property)
*/
case DISKOP_SPARE_ACTIVE:
- close_sync(conf);
+ if (conf->start_future > 0) {
+ MD_BUG();
+ err = -EBUSY;
+ break;
+ }
sdisk = conf->mirrors + spare_disk;
fdisk = conf->mirrors + failed_disk;

@@ -1328,23 +1353,25 @@
int bsize;
int disk;
int block_nr;
+ int buffs;

+ if (!sector_nr) {
+ /* we want enough buffers to hold twice the window of 128*/
+ buffs = 128 *2 / (PAGE_SIZE>>9);
+ buffs = raid1_grow_buffers(conf, buffs);
+ if (buffs < 2)
+ goto nomem;
+ conf->window = buffs*(PAGE_SIZE>>9)/2;
+ }
spin_lock_irq(&conf->segment_lock);
if (!sector_nr) {
/* initialize ...*/
- int buffs;
conf->start_active = 0;
conf->start_ready = 0;
conf->start_pending = 0;
conf->start_future = 0;
conf->phase = 0;
- /* we want enough buffers to hold twice the window of 128*/
- buffs = 128 *2 / (PAGE_SIZE>>9);
- buffs = raid1_grow_buffers(conf, buffs);
- if (buffs < 2)
- goto nomem;

- conf->window = buffs*(PAGE_SIZE>>9)/2;
conf->cnt_future += conf->cnt_done+conf->cnt_pending;
conf->cnt_done = conf->cnt_pending = 0;
if (conf->cnt_ready || conf->cnt_active)
@@ -1429,7 +1456,6 @@

nomem:
raid1_shrink_buffers(conf);
- spin_unlock_irq(&conf->segment_lock);
return -ENOMEM;
}

2002-03-26 22:07:13

by Paul Clements

[permalink] [raw]
Subject: Re: [PATCH] 2.4.18 raid1 - fix SMP locking/interrupt errors, fix resync counter errors

Neil,

I got a chance to test your patch for several hours today.
It looks like all the problems are fixed.

Thanks,
Paul

--
Paul Clements
SteelEye Technology
[email protected]


On Tue, 26 Mar 2002, Neil Brown wrote:

> On Monday March 25, [email protected] wrote:
> > Neil,
> >
> > Thanks for your feedback. Replies below...
> >
>
> ditto :-)
>
> > >
> > > I can believe that there could be extra contention because of the dual
> > > use of this spin lock. Do you have lockmeter numbers at all?
> >
> > No, I'm not familiar with that. How do I get those? Is it fairly
> > simple?
>
> I've never tried myself, so I don't know of simple it is. The relevant
> home page seems to be:
> http://oss.sgi.com/projects/lockmeter/
>
> >
> > I wasn't so much concerned about extra contention as the (in my mind)
> > logical separation of these two different tasks, and the fact that
> > the lack of separation had led to a deadlock.
> >
>
> Certainly these are logically distinct uses of the same lock, though
> there are still closely related.
> There is often a tension between splitting a lock to reduce
> granularity and keeping the number of spinlocks to a minimum.
> In general I would only split a lock if their was either a clear
> semantic need, or measureable performance impact.
>
> If a deadlock were a consequence of not splitting, that would be a
> strong argument, but I think we can avoid the deadlock by other means.
>
> >
> > > However I cannot see how it would cause a deadlock. Could you please
> > > give details?
> >
> > raid1_diskop() calls close_sync() -- close_sync() schedules itself out
> > to wait for pending I/O to quiesce so that the resync can end...
> > meanwhile #CPUs (in my case, 2) tasks enter into any of the memory
> > (de)allocation routines and spin on the device_lock forever...
>
> Ahhhh.... Thanks....
> close_sync() definately shouldn't be called with a spinlock held. In
> the patch below I have moved it out of the locked region.
>
> > > You are definately right that we should not be calling kmalloc with a
> > > spinlock held - my bad.
> > > However I don't think your fix is ideal. The relevant code is
> > > "raid1_grow_buffers" which allocates a bunch of buffers and attaches
> > > them to the device structure.
> > > The lock is only realy needed for the attachment. A better fix would
> > > be to build a separate list, and then just claim the lock while
> > > attaching that list to the structure.
> >
> > Unfortunately, this won't work, because the segment_lock is also held
> > while this code is executing (see raid1_sync_request).
> >
>
> Good point, thanks. I think this means that the call to
> raid1_grow_buffers needs to be moved out from inside the locked
> region. This is done in the following patch.
>
> >
> > > >
> > > > 3) incorrect enabling/disabling of interrupts during locking
> > > >
> ...
> > >
> > > I don't believe that this is true.
> > > The save/restore versions are only needed if the code might be called
> > > from interrupt context. However the routines where you made this
> > > change: raid1_grow_buffers, raid1_shrink_buffers, close_sync,
> > > are only ever called from process context, with interrupts enabled.
> > > Or am I missing something?
> >
> > please see my other e-mail reply to Andrew Morton regarding this...
>
> OK, I understand now. spin_lock_irq is being called while
> spin_lock_irq is already in-force. I think this is best fixed by
> moving calls outside of locked regions as mentioned above.
>
>
> >
> >
> > > >
> > > > 4) incorrect setting of conf->cnt_future and conf->phase resync counters
> > > >
> > > > The symptoms of this problem were that, if I/O was occurring when a
> > > > resync ended (or was aborted), the resync would hang and never complete.
> > > > This eventually would cause all I/O to the md device to hang.
> > >
> > > I'll have to look at this one a bit more closely. I'll let you know
> > > what I think of it.
> >
> > OK. If you come up with something better, please let me know.
>
> OK, I've had a look, and I see what the problem is:
>
> conf->start_future = mddev->sb->size+1;
>
> start_future is in sectors. sb->size is in Kibibytes :-(
> Should be
> conf->start_future = (mddev->sb->size<<1)+1;
>
> This error would explain your symptom.
>
>
> Below is a patch which I believe should address all of your symptoms
> and the bugs that they expose. If your are able to test it and let me
> know how it works for you I would appreciate it.
>
> Thanks
> NeilBrown
>
>
>
> ----------- Diffstat output ------------
> ./drivers/md/raid1.c | 54 +++++++++++++++++++++++++++++++++++++--------------
> 1 files changed, 40 insertions(+), 14 deletions(-)
>
> --- ./drivers/md/raid1.c 2002/03/25 21:53:59 1.1
> +++ ./drivers/md/raid1.c 2002/03/26 01:47:56 1.2
> @@ -269,8 +269,9 @@
> static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
> {
> int i = 0;
> + struct raid1_bh *head = NULL, **tail;
> + tail = &head;
>
> - md_spin_lock_irq(&conf->device_lock);
> while (i < cnt) {
> struct raid1_bh *r1_bh;
> struct page *page;
> @@ -287,10 +288,18 @@
> memset(r1_bh, 0, sizeof(*r1_bh));
> r1_bh->bh_req.b_page = page;
> r1_bh->bh_req.b_data = page_address(page);
> - r1_bh->next_r1 = conf->freebuf;
> - conf->freebuf = r1_bh;
> + *tail = r1_bh;
> + r1_bh->next_r1 = NULL;
> + tail = & r1_bh->next_r1;
> i++;
> }
> + /* this lock probably isn't needed, as at the time when
> + * we are allocating buffers, nobody else will be touching the
> + * freebuf list. But it doesn't hurt....
> + */
> + md_spin_lock_irq(&conf->device_lock);
> + *tail = conf->freebuf;
> + conf->freebuf = head;
> md_spin_unlock_irq(&conf->device_lock);
> return i;
> }
> @@ -825,7 +834,7 @@
> conf->start_ready = conf->start_pending;
> wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
> conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
> - conf->start_future = mddev->sb->size+1;
> + conf->start_future = (mddev->sb->size<<1)+1;
> conf->cnt_pending = conf->cnt_future;
> conf->cnt_future = 0;
> conf->phase = conf->phase ^1;
> @@ -849,6 +858,14 @@
> mdk_rdev_t *spare_rdev, *failed_rdev;
>
> print_raid1_conf(conf);
> +
> + switch (state) {
> + case DISKOP_SPARE_ACTIVE:
> + case DISKOP_SPARE_INACTIVE:
> + /* need to wait for pending sync io before locking device */
> + close_sync(conf);
> + }
> +
> md_spin_lock_irq(&conf->device_lock);
> /*
> * find the disk ...
> @@ -951,7 +968,11 @@
> * Deactivate a spare disk:
> */
> case DISKOP_SPARE_INACTIVE:
> - close_sync(conf);
> + if (conf->start_future > 0) {
> + MD_BUG();
> + err = -EBUSY;
> + break;
> + }
> sdisk = conf->mirrors + spare_disk;
> sdisk->operational = 0;
> sdisk->write_only = 0;
> @@ -964,7 +985,11 @@
> * property)
> */
> case DISKOP_SPARE_ACTIVE:
> - close_sync(conf);
> + if (conf->start_future > 0) {
> + MD_BUG();
> + err = -EBUSY;
> + break;
> + }
> sdisk = conf->mirrors + spare_disk;
> fdisk = conf->mirrors + failed_disk;
>
> @@ -1328,23 +1353,25 @@
> int bsize;
> int disk;
> int block_nr;
> + int buffs;
>
> + if (!sector_nr) {
> + /* we want enough buffers to hold twice the window of 128*/
> + buffs = 128 *2 / (PAGE_SIZE>>9);
> + buffs = raid1_grow_buffers(conf, buffs);
> + if (buffs < 2)
> + goto nomem;
> + conf->window = buffs*(PAGE_SIZE>>9)/2;
> + }
> spin_lock_irq(&conf->segment_lock);
> if (!sector_nr) {
> /* initialize ...*/
> - int buffs;
> conf->start_active = 0;
> conf->start_ready = 0;
> conf->start_pending = 0;
> conf->start_future = 0;
> conf->phase = 0;
> - /* we want enough buffers to hold twice the window of 128*/
> - buffs = 128 *2 / (PAGE_SIZE>>9);
> - buffs = raid1_grow_buffers(conf, buffs);
> - if (buffs < 2)
> - goto nomem;
>
> - conf->window = buffs*(PAGE_SIZE>>9)/2;
> conf->cnt_future += conf->cnt_done+conf->cnt_pending;
> conf->cnt_done = conf->cnt_pending = 0;
> if (conf->cnt_ready || conf->cnt_active)
> @@ -1429,7 +1456,6 @@
>
> nomem:
> raid1_shrink_buffers(conf);
> - spin_unlock_irq(&conf->segment_lock);
> return -ENOMEM;
> }
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>