Hi,
This series contains a few cleanups and minor fixes to the raid5 code
to make it a bit easier to hack on.
Patch 1 cleans up the error returns in setup_conf() (I had an
abandoned patch that added another error exit and needed this cleanup
to be sensible).
Patch 2 fixes a sparse warning with the raid5_percpu structure.
Patches 3 through 6 fix sparse warnings related to missing __rcu
annotations when using the RCU calls.
Patch 7 just adds some basic __must_hold annotations for the device_lock
to any function that is called while holding the lock. Sparse doesn't
really check this, but the annotation makes the locks a little easier
to analyze.
Thanks,
Logan
--
Logan Gunthorpe (7):
md/raid5: Cleanup setup_conf() error returns
md/raid5: Un-nest struct raid5_percpu definition
md/raid5: Add __rcu annotation to struct disk_info
md/raid5: Annotate rdev/replacement accesses when nr_pending is
elevated
md/raid5: Annotate rdev/replacement access when mddev_lock is held
md/raid5-ppl: Annotate with rcu_dereference_protected()
md/raid5: Annotate functions that hold device_lock with __must_hold
drivers/md/raid5-ppl.c | 13 ++-
drivers/md/raid5.c | 179 ++++++++++++++++++++++++++---------------
drivers/md/raid5.h | 23 +++---
3 files changed, 139 insertions(+), 76 deletions(-)
base-commit: 3123109284176b1532874591f7c81f3837bbdc17
--
2.30.2
Be more careful about the error returns. Most errors in this function
are actually ENOMEM, but it forcibly returns EIO if conf has been
allocated.
Instead return ret and ensure it is set appropriately before each goto
abort.
Signed-off-by: Logan Gunthorpe <[email protected]>
---
drivers/md/raid5.c | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 351d341a1ffa..c0e373a02d3a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7166,7 +7166,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
int i;
int group_cnt;
struct r5worker_group *new_group;
- int ret;
+ int ret = -ENOMEM;
if (mddev->new_level != 5
&& mddev->new_level != 4
@@ -7225,6 +7225,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
spin_lock_init(&conf->device_lock);
seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
mutex_init(&conf->cache_size_mutex);
+
init_waitqueue_head(&conf->wait_for_quiescent);
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
@@ -7302,11 +7303,13 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf->level = mddev->new_level;
conf->chunk_sectors = mddev->new_chunk_sectors;
- if (raid5_alloc_percpu(conf) != 0)
+ ret = raid5_alloc_percpu(conf);
+ if (ret)
goto abort;
pr_debug("raid456: run(%s) called.\n", mdname(mddev));
+ ret = -EIO;
rdev_for_each(rdev, mddev) {
raid_disk = rdev->raid_disk;
if (raid_disk >= max_disks
@@ -7370,6 +7373,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
if (grow_stripes(conf, conf->min_nr_stripes)) {
pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
mdname(mddev), memory);
+ ret = -ENOMEM;
goto abort;
} else
pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
@@ -7383,7 +7387,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf->shrinker.count_objects = raid5_cache_count;
conf->shrinker.batch = 128;
conf->shrinker.flags = 0;
- if (register_shrinker(&conf->shrinker)) {
+ ret = register_shrinker(&conf->shrinker);
+ if (ret) {
pr_warn("md/raid:%s: couldn't register shrinker.\n",
mdname(mddev));
goto abort;
@@ -7394,17 +7399,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
if (!conf->thread) {
pr_warn("md/raid:%s: couldn't allocate thread.\n",
mdname(mddev));
+ ret = -ENOMEM;
goto abort;
}
return conf;
abort:
- if (conf) {
+ if (conf)
free_conf(conf);
- return ERR_PTR(-EIO);
- } else
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(ret);
}
static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
--
2.30.2
A handful of functions note the device_lock must be held with a comment
but this is not comprehensive. Many other functions are called with the
lock held, so add a __must_hold() annotation to each of them to document
that the lock is held.
This makes it a bit easier to analyse device_lock.
Signed-off-by: Logan Gunthorpe <[email protected]>
---
drivers/md/raid5.c | 22 +++++++++++++++++-----
1 file changed, 17 insertions(+), 5 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index fa955d23c88f..b53000a917df 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -79,18 +79,21 @@ static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
}
static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
+ __acquires(&conf->device_lock)
{
spin_lock_irq(conf->hash_locks + hash);
spin_lock(&conf->device_lock);
}
static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
+ __releases(&conf->device_lock)
{
spin_unlock(&conf->device_lock);
spin_unlock_irq(conf->hash_locks + hash);
}
static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
+ __acquires(&conf->device_lock)
{
int i;
spin_lock_irq(conf->hash_locks);
@@ -100,6 +103,7 @@ static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
}
static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
+ __releases(&conf->device_lock)
{
int i;
spin_unlock(&conf->device_lock);
@@ -164,6 +168,7 @@ static bool stripe_is_lowprio(struct stripe_head *sh)
}
static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
+ __must_hold(&sh->raid_conf->device_lock)
{
struct r5conf *conf = sh->raid_conf;
struct r5worker_group *group;
@@ -211,6 +216,7 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
struct list_head *temp_inactive_list)
+ __must_hold(&conf->device_lock)
{
int i;
int injournal = 0; /* number of date pages with R5_InJournal */
@@ -296,6 +302,7 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
struct list_head *temp_inactive_list)
+ __must_hold(&conf->device_lock)
{
if (atomic_dec_and_test(&sh->count))
do_release_stripe(conf, sh, temp_inactive_list);
@@ -350,9 +357,9 @@ static void release_inactive_stripe_list(struct r5conf *conf,
}
}
-/* should hold conf->device_lock already */
static int release_stripe_list(struct r5conf *conf,
struct list_head *temp_inactive_list)
+ __must_hold(&conf->device_lock)
{
struct stripe_head *sh, *t;
int count = 0;
@@ -629,6 +636,10 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
* This is because some failed devices may only affect one
* of the two sections, and some non-in_sync devices may
* be insync in the section most affected by failed devices.
+ *
+ * Most calls to this function hold &conf->device_lock. Calls
+ * in raid5_run() do not require the lock as no other threads
+ * have been started yet.
*/
int raid5_calc_degraded(struct r5conf *conf)
{
@@ -5278,6 +5289,7 @@ static void handle_stripe(struct stripe_head *sh)
}
static void raid5_activate_delayed(struct r5conf *conf)
+ __must_hold(&conf->device_lock)
{
if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
while (!list_empty(&conf->delayed_list)) {
@@ -5295,9 +5307,9 @@ static void raid5_activate_delayed(struct r5conf *conf)
}
static void activate_bit_delay(struct r5conf *conf,
- struct list_head *temp_inactive_list)
+ struct list_head *temp_inactive_list)
+ __must_hold(&conf->device_lock)
{
- /* device_lock is held */
struct list_head head;
list_add(&head, &conf->bitmap_list);
list_del_init(&conf->bitmap_list);
@@ -5522,6 +5534,7 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
* handle_list.
*/
static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
+ __must_hold(&conf->device_lock)
{
struct stripe_head *sh, *tmp;
struct list_head *handle_list = NULL;
@@ -6393,8 +6406,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
static int handle_active_stripes(struct r5conf *conf, int group,
struct r5worker *worker,
struct list_head *temp_inactive_list)
- __releases(&conf->device_lock)
- __acquires(&conf->device_lock)
+ __must_hold(&conf->device_lock)
{
struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
int i, batch_size = 0, hash;
--
2.30.2
There are a number of accesses to __rcu variables that should be safe
because nr_pending in the disk is known to be elevated.
Create a wrapper around rcu_dereference_protected() to annotate these
accesses and verify that nr_pending is non-zero.
This fixes a number of sparse warnings.
Signed-off-by: Logan Gunthorpe <[email protected]>
---
drivers/md/raid5.c | 28 +++++++++++++++++++---------
1 file changed, 19 insertions(+), 9 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4815f5351818..0f29a2769cb3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2648,6 +2648,16 @@ static void shrink_stripes(struct r5conf *conf)
conf->slab_cache = NULL;
}
+/*
+ * This helper wraps rcu_dereference_protected() and can be used when
+ * it is known that the nr_pending of the rdev is elevated.
+ */
+static struct md_rdev *rdev_pend_deref(struct md_rdev __rcu *rdev)
+{
+ return rcu_dereference_protected(rdev,
+ atomic_read(&rcu_access_pointer(rdev)->nr_pending));
+}
+
static void raid5_end_read_request(struct bio * bi)
{
struct stripe_head *sh = bi->bi_private;
@@ -2674,9 +2684,9 @@ static void raid5_end_read_request(struct bio * bi)
* In that case it moved down to 'rdev'.
* rdev is not removed until all requests are finished.
*/
- rdev = conf->disks[i].replacement;
+ rdev = rdev_pend_deref(conf->disks[i].replacement);
if (!rdev)
- rdev = conf->disks[i].rdev;
+ rdev = rdev_pend_deref(conf->disks[i].rdev);
if (use_new_offset(conf, sh))
s = sh->sector + rdev->new_data_offset;
@@ -2790,11 +2800,11 @@ static void raid5_end_write_request(struct bio *bi)
for (i = 0 ; i < disks; i++) {
if (bi == &sh->dev[i].req) {
- rdev = conf->disks[i].rdev;
+ rdev = rdev_pend_deref(conf->disks[i].rdev);
break;
}
if (bi == &sh->dev[i].rreq) {
- rdev = conf->disks[i].replacement;
+ rdev = rdev_pend_deref(conf->disks[i].replacement);
if (rdev)
replacement = 1;
else
@@ -2802,7 +2812,7 @@ static void raid5_end_write_request(struct bio *bi)
* replaced it. rdev is not removed
* until all requests are finished.
*/
- rdev = conf->disks[i].rdev;
+ rdev = rdev_pend_deref(conf->disks[i].rdev);
break;
}
}
@@ -5213,23 +5223,23 @@ static void handle_stripe(struct stripe_head *sh)
struct r5dev *dev = &sh->dev[i];
if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
/* We own a safe reference to the rdev */
- rdev = conf->disks[i].rdev;
+ rdev = rdev_pend_deref(conf->disks[i].rdev);
if (!rdev_set_badblocks(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0))
md_error(conf->mddev, rdev);
rdev_dec_pending(rdev, conf->mddev);
}
if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
- rdev = conf->disks[i].rdev;
+ rdev = rdev_pend_deref(conf->disks[i].rdev);
rdev_clear_badblocks(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0);
rdev_dec_pending(rdev, conf->mddev);
}
if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
- rdev = conf->disks[i].replacement;
+ rdev = rdev_pend_deref(conf->disks[i].replacement);
if (!rdev)
/* rdev have been moved down */
- rdev = conf->disks[i].rdev;
+ rdev = rdev_pend_deref(conf->disks[i].rdev);
rdev_clear_badblocks(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0);
rdev_dec_pending(rdev, conf->mddev);
--
2.30.2
rdev and replacement are protected in some circumstances with
rcu_dereference and synchronize_rcu (in raid5_remove_disk()). However,
they were not annotated with __rcu so a sparse warning is emitted for
every rcu_dereference() call.
Add the __rcu annotation and fix up the initialization with
RCU_INIT_POINTER, all pointer modifications with rcu_assign_pointer(),
a few cases where the pointer value is tested with rcu_access_pointer(),
one case where READ_ONCE() is used instead of rcu_dereference(), and
a case in print_raid5_conf() that should have rcu_dereference() and
rcu_read_[un]lock() calls.
Additional sparse issues will be fixed up in further commits.
Signed-off-by: Logan Gunthorpe <[email protected]>
---
drivers/md/raid5.c | 46 ++++++++++++++++++++++++++--------------------
drivers/md/raid5.h | 3 ++-
2 files changed, 28 insertions(+), 21 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c0e373a02d3a..4815f5351818 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6288,7 +6288,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
*/
rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) {
- struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
+ struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
if (rdev == NULL || test_bit(Faulty, &rdev->flags))
still_degraded = 1;
@@ -7320,11 +7320,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
if (test_bit(Replacement, &rdev->flags)) {
if (disk->replacement)
goto abort;
- disk->replacement = rdev;
+ RCU_INIT_POINTER(disk->replacement, rdev);
} else {
if (disk->rdev)
goto abort;
- disk->rdev = rdev;
+ RCU_INIT_POINTER(disk->rdev, rdev);
}
if (test_bit(In_sync, &rdev->flags)) {
@@ -7631,11 +7631,11 @@ static int raid5_run(struct mddev *mddev)
rdev = conf->disks[i].replacement;
conf->disks[i].replacement = NULL;
clear_bit(Replacement, &rdev->flags);
- conf->disks[i].rdev = rdev;
+ rcu_assign_pointer(conf->disks[i].rdev, rdev);
}
if (!rdev)
continue;
- if (conf->disks[i].replacement &&
+ if (rcu_access_pointer(conf->disks[i].replacement) &&
conf->reshape_progress != MaxSector) {
/* replacements and reshape simply do not mix. */
pr_warn("md: cannot handle concurrent replacement and reshape.\n");
@@ -7836,8 +7836,8 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev)
static void print_raid5_conf (struct r5conf *conf)
{
+ struct md_rdev *rdev;
int i;
- struct disk_info *tmp;
pr_debug("RAID conf printout:\n");
if (!conf) {
@@ -7848,14 +7848,16 @@ static void print_raid5_conf (struct r5conf *conf)
conf->raid_disks,
conf->raid_disks - conf->mddev->degraded);
+ rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) {
char b[BDEVNAME_SIZE];
- tmp = conf->disks + i;
- if (tmp->rdev)
+ rdev = rcu_dereference(conf->disks[i].rdev);
+ if (rdev)
pr_debug(" disk %d, o:%d, dev:%s\n",
- i, !test_bit(Faulty, &tmp->rdev->flags),
- bdevname(tmp->rdev->bdev, b));
+ i, !test_bit(Faulty, &rdev->flags),
+ bdevname(rdev->bdev, b));
}
+ rcu_read_unlock();
}
static int raid5_spare_active(struct mddev *mddev)
@@ -7906,8 +7908,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
struct r5conf *conf = mddev->private;
int err = 0;
int number = rdev->raid_disk;
- struct md_rdev **rdevp;
+ struct md_rdev __rcu **rdevp;
struct disk_info *p = conf->disks + number;
+ struct md_rdev *tmp;
print_raid5_conf(conf);
if (test_bit(Journal, &rdev->flags) && conf->log) {
@@ -7925,9 +7928,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
log_exit(conf);
return 0;
}
- if (rdev == p->rdev)
+ if (rdev == rcu_access_pointer(p->rdev))
rdevp = &p->rdev;
- else if (rdev == p->replacement)
+ else if (rdev == rcu_access_pointer(p->replacement))
rdevp = &p->replacement;
else
return 0;
@@ -7947,7 +7950,8 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (!test_bit(Faulty, &rdev->flags) &&
mddev->recovery_disabled != conf->recovery_disabled &&
!has_failed(conf) &&
- (!p->replacement || p->replacement == rdev) &&
+ (!rcu_access_pointer(p->replacement) ||
+ rcu_access_pointer(p->replacement) == rdev) &&
number < conf->raid_disks) {
err = -EBUSY;
goto abort;
@@ -7958,7 +7962,7 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (atomic_read(&rdev->nr_pending)) {
/* lost the race, try later */
err = -EBUSY;
- *rdevp = rdev;
+ rcu_assign_pointer(*rdevp, rdev);
}
}
if (!err) {
@@ -7966,17 +7970,19 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (err)
goto abort;
}
- if (p->replacement) {
+
+ tmp = rcu_access_pointer(p->replacement);
+ if (tmp) {
/* We must have just cleared 'rdev' */
- p->rdev = p->replacement;
- clear_bit(Replacement, &p->replacement->flags);
+ rcu_assign_pointer(p->rdev, tmp);
+ clear_bit(Replacement, &tmp->flags);
smp_mb(); /* Make sure other CPUs may see both as identical
* but will never see neither - if they are careful
*/
- p->replacement = NULL;
+ rcu_assign_pointer(p->replacement, NULL);
if (!err)
- err = log_modify(conf, p->rdev, true);
+ err = log_modify(conf, tmp, true);
}
clear_bit(WantReplacement, &rdev->flags);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 61bc2e1f1b4e..638d29863503 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -473,7 +473,8 @@ enum {
*/
struct disk_info {
- struct md_rdev *rdev, *replacement;
+ struct md_rdev __rcu *rdev;
+ struct md_rdev __rcu *replacement;
struct page *extra_page; /* extra page to use in prexor */
};
--
2.30.2
Sparse reports many warnings of the form:
drivers/md/raid5.c:1476:16: warning: dereference of noderef expression
This is because all struct raid5_percpu definitions get marked as
__percpu when really only the pointer in r5conf should have that
annotation.
Fix this by moving the definition of raid5_percpu out of the definition
of struct r5conf.
Signed-off-by: Logan Gunthorpe <[email protected]>
---
drivers/md/raid5.h | 20 +++++++++++---------
1 file changed, 11 insertions(+), 9 deletions(-)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 9e8486a9e445..61bc2e1f1b4e 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -560,6 +560,16 @@ struct r5pending_data {
struct bio_list bios;
};
+struct raid5_percpu {
+ struct page *spare_page; /* Used when checking P/Q in raid6 */
+ void *scribble; /* space for constructing buffer
+ * lists and performing address
+ * conversions
+ */
+ int scribble_obj_size;
+ local_lock_t lock;
+};
+
struct r5conf {
struct hlist_head *stripe_hashtbl;
/* only protect corresponding hash list and inactive_list */
@@ -635,15 +645,7 @@ struct r5conf {
*/
int recovery_disabled;
/* per cpu variables */
- struct raid5_percpu {
- struct page *spare_page; /* Used when checking P/Q in raid6 */
- void *scribble; /* space for constructing buffer
- * lists and performing address
- * conversions
- */
- int scribble_obj_size;
- local_lock_t lock;
- } __percpu *percpu;
+ struct raid5_percpu __percpu *percpu;
int scribble_disks;
int scribble_sectors;
struct hlist_node node;
--
2.30.2
To suppress the last remaining sparse warnings about accessing
rdev, add rcu_dereference_protected() calls to a couple of places
in raid5-ppl. All of these places are called under raid5_run() and
therefore run before the array has started, so the accesses are
safe.
There's no sensible check to do for the second argument of
rcu_dereference_protected() so a comment is added instead.
Signed-off-by: Logan Gunthorpe <[email protected]>
---
drivers/md/raid5-ppl.c | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index d3962d92df18..55d065a87b89 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -883,7 +883,9 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
(unsigned long long)r_sector, dd_idx,
(unsigned long long)sector);
- rdev = conf->disks[dd_idx].rdev;
+ /* Array has not started so rcu dereference is safe */
+ rdev = rcu_dereference_protected(
+ conf->disks[dd_idx].rdev, 1);
if (!rdev || (!test_bit(In_sync, &rdev->flags) &&
sector >= rdev->recovery_offset)) {
pr_debug("%s:%*s data member disk %d missing\n",
@@ -934,7 +936,10 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
parity_sector = raid5_compute_sector(conf, r_sector_first + i,
0, &disk, &sh);
BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk));
- parity_rdev = conf->disks[sh.pd_idx].rdev;
+
+ /* Array has not started so rcu dereference is safe */
+ parity_rdev = rcu_dereference_protected(
+ conf->disks[sh.pd_idx].rdev, 1);
BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev);
pr_debug("%s:%*s write parity at sector %llu, disk %s\n",
@@ -1404,7 +1409,9 @@ int ppl_init_log(struct r5conf *conf)
for (i = 0; i < ppl_conf->count; i++) {
struct ppl_log *log = &ppl_conf->child_logs[i];
- struct md_rdev *rdev = conf->disks[i].rdev;
+ /* Array has not started so rcu dereference is safe */
+ struct md_rdev *rdev =
+ rcu_dereference_protected(conf->disks[i].rdev, 1);
mutex_init(&log->io_mutex);
spin_lock_init(&log->io_list_lock);
--
2.30.2
The mddev_lock should be held during raid5_remove_disk() which is when
the rdev/replacement pointers are modified. So any access to these
pointers marked __rcu should be safe whenever the mddev_lock is held.
There are numerous such accesses that currently produce sparse warnings.
Add a helper function, rdev_mdlock_deref() that wraps
rcu_dereference_protected() in all these instances.
This annotation fixes a number of sparse warnings.
Signed-off-by: Logan Gunthorpe <[email protected]>
---
drivers/md/raid5.c | 65 ++++++++++++++++++++++++++++++----------------
1 file changed, 43 insertions(+), 22 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0f29a2769cb3..fa955d23c88f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2658,6 +2658,18 @@ static struct md_rdev *rdev_pend_deref(struct md_rdev __rcu *rdev)
atomic_read(&rcu_access_pointer(rdev)->nr_pending));
}
+/*
+ * This helper wraps rcu_dereference_protected() and should be used
+ * when it is known that the mddev_lock() is held. This is safe
+ * seeing raid5_remove_disk() has the same lock held.
+ */
+static struct md_rdev *rdev_mdlock_deref(struct mddev *mddev,
+ struct md_rdev __rcu *rdev)
+{
+ return rcu_dereference_protected(rdev,
+ lockdep_is_held(&mddev->reconfig_mutex));
+}
+
static void raid5_end_read_request(struct bio * bi)
{
struct stripe_head *sh = bi->bi_private;
@@ -7635,10 +7647,11 @@ static int raid5_run(struct mddev *mddev)
for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
i++) {
- rdev = conf->disks[i].rdev;
+ rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev);
if (!rdev && conf->disks[i].replacement) {
/* The replacement is all we have yet */
- rdev = conf->disks[i].replacement;
+ rdev = rdev_mdlock_deref(mddev,
+ conf->disks[i].replacement);
conf->disks[i].replacement = NULL;
clear_bit(Replacement, &rdev->flags);
rcu_assign_pointer(conf->disks[i].rdev, rdev);
@@ -7874,36 +7887,38 @@ static int raid5_spare_active(struct mddev *mddev)
{
int i;
struct r5conf *conf = mddev->private;
- struct disk_info *tmp;
+ struct md_rdev *rdev, *replacement;
int count = 0;
unsigned long flags;
for (i = 0; i < conf->raid_disks; i++) {
- tmp = conf->disks + i;
- if (tmp->replacement
- && tmp->replacement->recovery_offset == MaxSector
- && !test_bit(Faulty, &tmp->replacement->flags)
- && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
+ rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev);
+ replacement = rdev_mdlock_deref(mddev,
+ conf->disks[i].replacement);
+ if (replacement
+ && replacement->recovery_offset == MaxSector
+ && !test_bit(Faulty, &replacement->flags)
+ && !test_and_set_bit(In_sync, &replacement->flags)) {
/* Replacement has just become active. */
- if (!tmp->rdev
- || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
+ if (!rdev
+ || !test_and_clear_bit(In_sync, &rdev->flags))
count++;
- if (tmp->rdev) {
+ if (rdev) {
/* Replaced device not technically faulty,
* but we need to be sure it gets removed
* and never re-added.
*/
- set_bit(Faulty, &tmp->rdev->flags);
+ set_bit(Faulty, &rdev->flags);
sysfs_notify_dirent_safe(
- tmp->rdev->sysfs_state);
+ rdev->sysfs_state);
}
- sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
- } else if (tmp->rdev
- && tmp->rdev->recovery_offset == MaxSector
- && !test_bit(Faulty, &tmp->rdev->flags)
- && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
+ sysfs_notify_dirent_safe(replacement->sysfs_state);
+ } else if (rdev
+ && rdev->recovery_offset == MaxSector
+ && !test_bit(Faulty, &rdev->flags)
+ && !test_and_set_bit(In_sync, &rdev->flags)) {
count++;
- sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
+ sysfs_notify_dirent_safe(rdev->sysfs_state);
}
}
spin_lock_irqsave(&conf->device_lock, flags);
@@ -7968,6 +7983,7 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
}
*rdevp = NULL;
if (!test_bit(RemoveSynchronized, &rdev->flags)) {
+ lockdep_assert_held(&mddev->reconfig_mutex);
synchronize_rcu();
if (atomic_read(&rdev->nr_pending)) {
/* lost the race, try later */
@@ -8008,6 +8024,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int ret, err = -EEXIST;
int disk;
struct disk_info *p;
+ struct md_rdev *tmp;
int first = 0;
int last = conf->raid_disks - 1;
@@ -8065,7 +8082,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
}
for (disk = first; disk <= last; disk++) {
p = conf->disks + disk;
- if (test_bit(WantReplacement, &p->rdev->flags) &&
+ tmp = rdev_mdlock_deref(mddev, p->rdev);
+ if (test_bit(WantReplacement, &tmp->flags) &&
p->replacement == NULL) {
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
@@ -8356,6 +8374,7 @@ static void end_reshape(struct r5conf *conf)
static void raid5_finish_reshape(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;
+ struct md_rdev *rdev;
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
@@ -8367,10 +8386,12 @@ static void raid5_finish_reshape(struct mddev *mddev)
for (d = conf->raid_disks ;
d < conf->raid_disks - mddev->delta_disks;
d++) {
- struct md_rdev *rdev = conf->disks[d].rdev;
+ rdev = rdev_mdlock_deref(mddev,
+ conf->disks[d].rdev);
if (rdev)
clear_bit(In_sync, &rdev->flags);
- rdev = conf->disks[d].replacement;
+ rdev = rdev_mdlock_deref(mddev,
+ conf->disks[d].replacement);
if (rdev)
clear_bit(In_sync, &rdev->flags);
}
--
2.30.2
On Thu, Apr 07, 2022 at 10:57:07AM -0600, Logan Gunthorpe wrote:
> Be more careful about the error returns. Most errors in this function
> are actually ENOMEM, but it forcibly returns EIO if conf has been
> allocated.
>
> Instead return ret and ensure it is set appropriately before each goto
> abort.
>
> Signed-off-by: Logan Gunthorpe <[email protected]>
Looks good:
Reviewed-by: Christoph Hellwig <[email protected]>
On Thu, Apr 07, 2022 at 10:57:08AM -0600, Logan Gunthorpe wrote:
> Sparse reports many warnings of the form:
> drivers/md/raid5.c:1476:16: warning: dereference of noderef expression
>
> This is because all struct raid5_percpu definitions get marked as
> __percpu when really only the pointer in r5conf should have that
> annotation.
>
> Fix this by moving the defnition of raid5_precpu out of the definition
> of struct r5conf.
Looks good:
Reviewed-by: Christoph Hellwig <[email protected]>
Looks good:
Reviewed-by: Christoph Hellwig <[email protected]>
Looks good:
Reviewed-by: Christoph Hellwig <[email protected]>
Looks good:
Reviewed-by: Christoph Hellwig <[email protected]>
Looks good:
Reviewed-by: Christoph Hellwig <[email protected]>
Looks good:
Reviewed-by: Christoph Hellwig <[email protected]>
On Thu, Apr 7, 2022 at 9:57 AM Logan Gunthorpe <[email protected]> wrote:
>
> Hi,
>
> This series contains a few cleanup and minor fixes to the raid5 code
> to make it a bit easier to hack on.
>
> Patch 1 cleans up the error returns in setup_conf() (I had an
> abandonded patch that added another error out and needed this clean
> more sensible).
>
> Patch 2 fixes a sparse warning with the raid5_percpu structure.
>
> Patch 3 through 6 fixes sparse warnings related to missing __rcu
> annotations when using the RCU calls.
>
> Patch 7 just adds some basic __must_hold annotations for the device_lock
> to any function that is called while holding the lock. Sparse doesn't
> really check this, but the annotation makes the locks a little easier
> to analyze.
>
> Thanks,
>
> Logan
Applied to md-next. Thanks!
>
> --
>
> Logan Gunthorpe (7):
> md/raid5: Cleanup setup_conf() error returns
> md/raid5: Un-nest struct raid5_percpu definition
> md/raid5: Add __rcu annotation to struct disk_info
> md/raid5: Annotate rdev/replacement accesses when nr_pending is
> elevated
> md/raid5: Annotate rdev/replacement access when mddev_lock is held
> md/raid5-ppl: Annotate with rcu_dereference_protected()
> md/raid5: Annotate functions that hold device_lock with __must_hold
>
> drivers/md/raid5-ppl.c | 13 ++-
> drivers/md/raid5.c | 179 ++++++++++++++++++++++++++---------------
> drivers/md/raid5.h | 23 +++---
> 3 files changed, 139 insertions(+), 76 deletions(-)
>
>
> base-commit: 3123109284176b1532874591f7c81f3837bbdc17
> --
> 2.30.2