Hello,
This is a revised set of patches which provides common
representation of dependencies between stacked devices (dm and md)
in sysfs.
Variants of bd_claim/bd_release are added to accept a kobject
and create symlinks between the claimed bdev and the holder.
dm/md will give a child of its gendisk kobject to bd_claim.
For example, if dm-0 maps to sda, we have the following symlinks;
/sys/block/dm-0/slaves/sda --> /sys/block/sda
/sys/block/sda/holders/dm-0 --> /sys/block/dm-0
Comments are welcome.
A few points I would appreciate comments/reviews from maintainers:
About sysfs
- I confirmed sysfs_remove_symlink() and kobject_del() don't
allocate memory in 2.6.15 and it seems true on the git head.
I would like to make sure it's true in future versions of kernel
because they are called during device-mapper's table swapping
where I/O to free memory could deadlock on the dm device.
What is the recommended way to do that?
Or can I just expect these functions will not allocate memory
in future versions of kernel?
About dm
- To get a reference to mapped_device, table_load() does
dm_get() before populating the table. It will dm_put() when
the table is being discarded or the table is being activated.
About md
- Rather than carrying the mddev pointer around, bd_claim is now
called twice. The first call, at lock_rdev(), is unchanged.
The second is at bind_rdev_to_array(), where the kobject is passed
and the symlinks are created.
--
Jun'ichi Nomura, NEC Solutions (America), Inc.
This patch modifies md driver to call bd_claim_by_kobject
and bd_release_from_kobject.
--
Jun'ichi Nomura, NEC Solutions (America), Inc.
This patch adds bd_claim_by_kobject and bd_release_from_kobject
which create/remove symlinks between the claimed bdev and
the holder.
--
Jun'ichi Nomura, NEC Solutions (America), Inc.
This patch modifies dm driver to call bd_claim_by_kobject
and bd_release_from_kobject.
To do that, reference to the mapped_device is added in
dm_table.
--
Jun'ichi Nomura, NEC Solutions (America), Inc.
On Wed, Feb 22, 2006 at 11:13:06AM -0500, Jun'ichi Nomura wrote:
> This patch modifies dm driver to call bd_claim_by_kobject
> and bd_release_from_kobject.
> To do that, reference to the mapped_device is added in
> dm_table.
This patch needs splitting up so that independent changes can be
considered separately.
c.f. The proposal from Mike Anderson (repeated below) which I prefer
because it makes it clear that a table always belongs to exactly one md.
Exposing dm_table_set_md() suggests a table can have its owning md
changed - it can't.
Alasdair
--
[email protected]
This patch adds a mapped_device member to the dm_table struct.
Signed-off-by: Mike Anderson <[email protected]>
drivers/md/dm-ioctl.c | 32 +++++++++++++++++++-------------
drivers/md/dm-table.c | 12 +++++++++++-
drivers/md/dm.h | 4 +++-
3 files changed, 33 insertions(+), 15 deletions(-)
Index: sas-2.6-patched/drivers/md/dm.h
===================================================================
--- sas-2.6-patched.orig/drivers/md/dm.h 2006-02-20 01:05:32.000000000 -0800
+++ sas-2.6-patched/drivers/md/dm.h 2006-02-20 01:42:29.000000000 -0800
@@ -99,7 +99,8 @@ int dm_suspended(struct mapped_device *m
* Functions for manipulating a table. Tables are also reference
* counted.
*---------------------------------------------------------------*/
-int dm_table_create(struct dm_table **result, int mode, unsigned num_targets);
+int dm_table_create(struct dm_table **result, int mode, unsigned
+ num_targets, struct mapped_device *md);
void dm_table_get(struct dm_table *t);
void dm_table_put(struct dm_table *t);
@@ -123,6 +124,7 @@ void dm_table_resume_targets(struct dm_t
int dm_table_any_congested(struct dm_table *t, int bdi_bits);
void dm_table_unplug_all(struct dm_table *t);
int dm_table_flush_all(struct dm_table *t);
+struct mapped_device *dm_table_get_md(struct dm_table *t);
/*-----------------------------------------------------------------
* A registry of target types.
Index: sas-2.6-patched/drivers/md/dm-table.c
===================================================================
--- sas-2.6-patched.orig/drivers/md/dm-table.c 2006-02-20 01:05:32.000000000 -0800
+++ sas-2.6-patched/drivers/md/dm-table.c 2006-02-20 01:42:29.000000000 -0800
@@ -33,6 +33,7 @@ struct dm_table {
unsigned int num_allocated;
sector_t *highs;
struct dm_target *targets;
+ struct mapped_device *md;
/*
* Indicates the rw permissions for the new logical
@@ -204,7 +205,8 @@ static int alloc_targets(struct dm_table
return 0;
}
-int dm_table_create(struct dm_table **result, int mode, unsigned num_targets)
+int dm_table_create(struct dm_table **result, int mode,
+ unsigned num_targets, struct mapped_device *md)
{
struct dm_table *t = kmalloc(sizeof(*t), GFP_KERNEL);
@@ -227,6 +229,7 @@ int dm_table_create(struct dm_table **re
}
t->mode = mode;
+ t->md = md;
*result = t;
return 0;
}
@@ -945,6 +948,12 @@ int dm_table_flush_all(struct dm_table *
return ret;
}
+struct mapped_device *dm_table_get_md(struct dm_table *t)
+{
+ dm_get(t->md);
+ return t->md;
+}
+
EXPORT_SYMBOL(dm_vcalloc);
EXPORT_SYMBOL(dm_get_device);
EXPORT_SYMBOL(dm_put_device);
@@ -955,3 +964,4 @@ EXPORT_SYMBOL(dm_table_put);
EXPORT_SYMBOL(dm_table_get);
EXPORT_SYMBOL(dm_table_unplug_all);
EXPORT_SYMBOL(dm_table_flush_all);
+EXPORT_SYMBOL(dm_table_get_md);
Index: sas-2.6-patched/drivers/md/dm-ioctl.c
===================================================================
--- sas-2.6-patched.orig/drivers/md/dm-ioctl.c 2006-02-20 01:05:32.000000000 -0800
+++ sas-2.6-patched/drivers/md/dm-ioctl.c 2006-02-20 01:42:29.000000000 -0800
@@ -972,27 +972,26 @@ static int populate_table(struct dm_tabl
static int table_load(struct dm_ioctl *param, size_t param_size)
{
- int r;
+ int r = -ENXIO;
struct hash_cell *hc;
struct dm_table *t;
- r = dm_table_create(&t, get_mode(param), param->target_count);
- if (r)
- return r;
-
- r = populate_table(t, param, param_size);
- if (r) {
- dm_table_put(t);
- return r;
- }
down_write(&_hash_lock);
hc = __find_device_hash_cell(param);
if (!hc) {
DMWARN("device doesn't appear to be in the dev hash table.");
- up_write(&_hash_lock);
- dm_table_put(t);
- return -ENXIO;
+ goto out;
+ }
+
+ r = dm_table_create(&t, get_mode(param), param->target_count,
+ hc->md);
+ if (r)
+ goto out;
+
+ r = populate_table(t, param, param_size);
+ if (r) {
+ goto table_out;
}
if (hc->new_map)
@@ -1001,6 +1000,13 @@ static int table_load(struct dm_ioctl *p
param->flags |= DM_INACTIVE_PRESENT_FLAG;
r = __dev_status(hc->md, param);
+
+ up_write(&_hash_lock);
+ return r;
+
+table_out:
+ dm_table_put(t);
+out:
up_write(&_hash_lock);
return r;
}
Hi,
Alasdair G Kergon wrote:
> This patch needs splitting up so that independent changes can be
> considered separately.
>
> c.f. The proposal from Mike Anderson (repeated below) which I prefer
> because it makes it clear that a table always belongs to exactly one md.
I like his proposed patch.
The interface is useful for my purpose too and moving table
creation inside _hash_lock means I don't need dm_get() either.
Is it going to be pushed to upstream?
I'll remake my patch based on it.
--
Jun'ichi Nomura, NEC Solutions (America), Inc.
On Wed, Feb 22, 2006 at 12:13:56PM -0500, Jun'ichi Nomura wrote:
> Alasdair G Kergon wrote:
> > This patch needs splitting up so that independent changes can be
> > considered separately.
> > c.f. The proposal from Mike Anderson (repeated below) which I prefer
> > because it makes it clear that a table always belongs to exactly one md.
> I like his proposed patch.
> The interface is useful for my purpose too and moving table
> creation inside _hash_lock means I don't need dm_get() neither.
The global _hash_lock should not be held (thereby locking out most dm ioctl
operations on any device) while the slow populate_table() runs.
I'm trying out a variant of the patch that drops and reacquires that lock.
Alasdair
--
[email protected]
On Wed, Feb 22, 2006 at 11:06:24AM -0500, Jun'ichi Nomura wrote:
> Hello,
>
> This is a revised set of pathces which provides common
> representation of dependencies between stacked devices (dm and md)
> in sysfs.
>
> Variants of bd_claim/bd_release are added to accept a kobject
> and create symlinks between the claimed bdev and the holder.
>
> dm/md will give a child of its gendisk kobject to bd_claim.
> For example, if dm-0 maps to sda, we have the following symlinks;
> /sys/block/dm-0/slaves/sda --> /sys/block/sda
> /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
>
> Comments are welcome.
>
> A few points I would appreciate comments/reviews from maintainers:
> About sysfs
> - I confirmed sysfs_remove_symlink() and kobject_del() don't
> allocate memory in 2.6.15 and it seems true on the git head.
> I would like to make sure it's true in future versions of kernel
> because they are called during device-mapper's table swapping
> where I/O to free memory could deadlock on the dm device.
> What is the recommended way to do that?
But it can possibly sleep.
Hm, wait, the put_device stuff can possibly sleep, the "raw"
kobject_del() stuff looks safe. Either way, they don't create new
memory, unless you do something really weird in your release callback.
So you should be safe here.
But if you want to be absolutely safe, look at the thread on lkml about
changing the scsi code to do the final release in a non-interrupt
context. That looks like it might be the same thing you want to do
here to guarantee that nothing bad happens.
thanks,
greg k-h
On Wed, Feb 22, 2006 at 11:13:00AM -0500, Jun'ichi Nomura wrote:
> +/* This is a mere directory in sysfs. No methods are needed. */
> +static struct kobj_type bd_holder_ktype = {
> + .release = NULL,
> + .sysfs_ops = NULL,
> + .default_attrs = NULL,
> +};
That doesn't look right. You always need a release function.
> +static inline void add_holder_dir(struct block_device *bdev)
> +{
> + struct kobject *kobj = &bdev->bd_holder_dir;
> +
> + kobj->ktype = &bd_holder_ktype;
> + kobject_set_name(kobj, "holders");
> + kobj->parent = bdev_get_kobj(bdev);
> + kobject_init(kobj);
> + kobject_add(kobj);
> + kobject_put(kobj->parent);
> +}
> +
> +static inline void del_holder_dir(struct block_device *bdev)
> +{
> + /*
> + * Don't kobject_unregister to avoid memory allocation
> + * in kobject_hotplug.
> + */
> + kobject_del(&bdev->bd_holder_dir);
> + kobject_put(&bdev->bd_holder_dir);
> +}
No, do it correctly please.
thanks,
greg k-h
Hi,
Alasdair G Kergon wrote:
> The global _hash_lock should not be held (thereby locking out most dm ioctl
> operations on any device) while the slow populate_table() runs.
>
> I'm trying out a variant of the patch that drops and reacquires that lock.
OK, thanks for the confirmation.
I guess the variant itself will need dm_get() to avoid md
being stolen. Depending on that, I might change my patch.
Attached is a revised patch based on Mike Anderson's patch.
--
Jun'ichi Nomura, NEC Solutions (America), Inc.
Hi Greg,
Thanks for comments.
Greg KH wrote:
>>+/* This is a mere directory in sysfs. No methods are needed. */
>>+static struct kobj_type bd_holder_ktype = {
>>+ .release = NULL,
>>+ .sysfs_ops = NULL,
>>+ .default_attrs = NULL,
>>+};
>
> That doesn't look right. You always need a release function.
I'll move them out to gendisk/hd_struct creation with proper
release function.
I thought it was correct because a NULL release function is
just ignored in kobject_cleanup() and it lets an outside function
release the whole structure.
But it seems wrong to embed these additional kobjects in
the structures which are logically separate from them.
>>+static inline void del_holder_dir(struct block_device *bdev)
>>+{
>>+ /*
>>+ * Don't kobject_unregister to avoid memory allocation
>>+ * in kobject_hotplug.
>>+ */
>>+ kobject_del(&bdev->bd_holder_dir);
>>+ kobject_put(&bdev->bd_holder_dir);
>>+}
>
> No, do it correctly please.
OK, I'll change them to kobject_unregister() and do it
when gendisk/hd_struct is removed.
Then we can avoid possible memory allocation in dm's atomic
operation, too.
--
Jun'ichi Nomura, NEC Solutions (America), Inc.
On Wed, Feb 22, 2006 at 05:22:02PM -0500, Jun'ichi Nomura wrote:
> Hi Greg,
>
> Thanks for comments.
>
> Greg KH wrote:
> >>+/* This is a mere directory in sysfs. No methods are needed. */
> >>+static struct kobj_type bd_holder_ktype = {
> >>+ .release = NULL,
> >>+ .sysfs_ops = NULL,
> >>+ .default_attrs = NULL,
> >>+};
> >
> >That doesn't look right. You always need a release function.
>
> I'll move them out to gendisk/hd_struct creation with proper
> release function.
>
> I thought it's correct because NULL release function is
> just ignored in kobject_cleanup() and it let outside function
> to release the whole structure.
> But it seems wrong to embed these additional kobjects in
> the structures which are logically separate from them.
>
> >>+static inline void del_holder_dir(struct block_device *bdev)
> >>+{
> >>+ /*
> >>+ * Don't kobject_unregister to avoid memory allocation
> >>+ * in kobject_hotplug.
> >>+ */
> >>+ kobject_del(&bdev->bd_holder_dir);
> >>+ kobject_put(&bdev->bd_holder_dir);
> >>+}
> >
> >No, do it correctly please.
>
> OK, I'll change them to kobject_unregister() and do it
> when gendisk/hd_struct is removed.
> Then we can avoid possible memory allocation in dm's atomic
> operation, too.
That sounds great.
thanks,
greg k-h
Hello Greg,
>>>>+/* This is a mere directory in sysfs. No methods are needed. */
>>>>+static struct kobj_type bd_holder_ktype = {
>>>>+ .release = NULL,
>>>>+ .sysfs_ops = NULL,
>>>>+ .default_attrs = NULL,
>>>>+};
>>>
>>>That doesn't look right. You always need a release function.
I updated the patch based on your comments.
Could you take a look at this version to see whether there's
any problematic use of sysfs/kobjects?
- I removed embedded child-kobjects from struct block_device
and struct gendisk which I added in my previous patch.
Kobject registration occurs when gendisk or hd_struct is
registered. Release function of the kobject type is added.
- Reference counting of kobjects is done in a much more symmetric
manner than before.
- Added bd_claim_by_disk/bd_release_from_disk inline functions
to help proper reference counting.
Thanks,
--
Jun'ichi Nomura, NEC Solutions (America), Inc.
On Thu, Feb 23, 2006 at 02:15:45PM -0500, Jun'ichi Nomura wrote:
> Hello Greg,
>
> >>>>+/* This is a mere directory in sysfs. No methods are needed. */
> >>>>+static struct kobj_type bd_holder_ktype = {
> >>>>+ .release = NULL,
> >>>>+ .sysfs_ops = NULL,
> >>>>+ .default_attrs = NULL,
> >>>>+};
> >>>
> >>>That doesn't look right. You always need a release function.
>
> I updated the patch based your comments.
> Could you take a look at this version whether there's
> any problematic use of sysfs/kobjects?
>
> - I removed embedded child-kobjects from struct block_device
> and struct gendisk which I added in my previous patch.
> Kobject registration occurs when gendisk or hd_struct is
> registered. Release function of the kobject type is added.
> - Reference counting of kobjects is done in much symmetric
> manner than before.
> - Added bd_claim_by_disk/bd_release_from_disk inline functions
> to help proper reference counting.
Looks great, only one comment:
> --- linux-2.6.16-rc4/fs/partitions/check.c 2006-02-17 17:23:45.000000000 -0500
> +++ linux-2.6.16-rc4/fs/partitions/check.c 2006-02-22 23:18:06.000000000 -0500
> @@ -297,6 +297,56 @@ struct kobj_type ktype_part = {
> .sysfs_ops = &part_sysfs_ops,
> };
>
> +static void dir_release(struct kobject *kobj)
> +{
> + kfree(kobj);
> +}
> +
> +static struct kobj_type dir_ktype = {
> + .release = dir_release,
> + .sysfs_ops = NULL,
> + .default_attrs = NULL,
> +};
> +
> +static inline struct kobject *add_dir(struct kobject *parent, const char *name)
> +{
> + struct kobject *k;
> +
> + if (!parent)
> + return NULL;
> +
> + k = kmalloc(sizeof(*k), GFP_KERNEL);
> + if (!k)
> + return NULL;
> +
> + memset(k, 0, sizeof(*k));
> + k->parent = parent;
> + k->ktype = &dir_ktype;
> + kobject_set_name(k, name);
> + kobject_register(k);
> +
> + return k;
> +}
This code looks good enough that we should add it to the core kobject
code, don't you think? Also, you might use kzalloc instead of kmalloc
here.
thanks,
greg k-h
Hi Greg,
Thank you for the comments.
Greg KH wrote:
>>+static inline struct kobject *add_dir(struct kobject *parent, const char *name)
>>+{
>>+ struct kobject *k;
>>+
>>+ if (!parent)
>>+ return NULL;
>>+
>>+ k = kmalloc(sizeof(*k), GFP_KERNEL);
>>+ if (!k)
>>+ return NULL;
>>+
>>+ memset(k, 0, sizeof(*k));
>>+ k->parent = parent;
>>+ k->ktype = &dir_ktype;
>>+ kobject_set_name(k, name);
>>+ kobject_register(k);
>>+
>>+ return k;
>>+}
>
> This code looks good enough that we should add it to the core kobject
> code, don't you think? Also, you might use kzalloc instead of kmalloc
> here.
Yes, it would be nice if kobject core has this function.
I'll move them to lib/kobject.c.
--
Jun'ichi Nomura, NEC Solutions (America), Inc.