From: Matteo Croce <[email protected]>
With this series a monotonically increasing number is added to disks,
precisely in the genhd struct, and it's exported in sysfs and uevent.
This helps the userspace correlate events for devices that reuse the
same device, like loop.
The first patch is the core one, patches 2..4 expose the information in
different ways, while the last one increases the sequence number for
loop devices at every attach.
# udevadm monitor -kp |grep -e ^DEVNAME -e ^DISKSEQ &
[1] 523
# losetup -fP 3part
[ 3698.615848] loop0: detected capacity change from 16384 to 0
DEVNAME=/dev/loop0
DISKSEQ=13
[ 3698.647189] loop0: p1 p2 p3
DEVNAME=/dev/loop0
DISKSEQ=13
DEVNAME=/dev/loop0p1
DISKSEQ=13
DEVNAME=/dev/loop0p2
DISKSEQ=13
DEVNAME=/dev/loop0p3
DISKSEQ=13
# losetup -fP 2part
[ 3705.170766] loop1: detected capacity change from 40960 to 0
DEVNAME=/dev/loop1
DISKSEQ=14
[ 3705.247280] loop1: p1 p2
DEVNAME=/dev/loop1
DISKSEQ=14
DEVNAME=/dev/loop1p1
DISKSEQ=14
DEVNAME=/dev/loop1p2
DISKSEQ=14
# ./getdiskseq /dev/loop*
/dev/loop0: 13
/dev/loop0p1: 13
/dev/loop0p2: 13
/dev/loop0p3: 13
/dev/loop1: 14
/dev/loop1p1: 14
/dev/loop1p2: 14
/dev/loop2: 5
/dev/loop3: 6
/dev/loop-control: Function not implemented
# grep . /sys/class/block/*/diskseq
/sys/class/block/loop0/diskseq:13
/sys/class/block/loop1/diskseq:14
/sys/class/block/loop2/diskseq:5
/sys/class/block/loop3/diskseq:6
/sys/class/block/ram0/diskseq:1
/sys/class/block/ram1/diskseq:2
/sys/class/block/vda/diskseq:7
If merged, this feature will immediately be used by userspace:
https://github.com/systemd/systemd/issues/17469#issuecomment-762919781
This is just a resend after the merge window and rebased on linux-block.
Also, added more people in CC from the get_maintainer.pl output
as well as -next in the subject.
Matteo Croce (5):
block: add disk sequence number
block: add ioctl to read the disk sequence number
block: refactor sysfs code
block: export diskseq in sysfs
loop: increment sequence number
Documentation/ABI/testing/sysfs-block | 12 ++++++++
block/genhd.c | 43 ++++++++++++++++++++++++---
block/ioctl.c | 2 ++
drivers/block/loop.c | 3 ++
include/linux/genhd.h | 2 ++
include/uapi/linux/fs.h | 1 +
6 files changed, 59 insertions(+), 4 deletions(-)
--
2.30.2
From: Matteo Croce <[email protected]>
Add a new BLKGETDISKSEQ ioctl which retrieves the disk sequence number
from the genhd structure.
Signed-off-by: Matteo Croce <[email protected]>
---
block/ioctl.c | 2 ++
include/uapi/linux/fs.h | 1 +
2 files changed, 3 insertions(+)
diff --git a/block/ioctl.c b/block/ioctl.c
index ff241e663c01..266315d00942 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -467,6 +467,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
BLKDEV_DISCARD_SECURE);
case BLKZEROOUT:
return blk_ioctl_zeroout(bdev, mode, arg);
+ case BLKGETDISKSEQ:
+ return put_u64(argp, bdev->bd_disk->diskseq);
case BLKREPORTZONE:
return blkdev_report_zones_ioctl(bdev, mode, cmd, arg);
case BLKRESETZONE:
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index f44eb0a04afd..5dc72bbdd9b7 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -184,6 +184,7 @@ struct fsxattr {
#define BLKSECDISCARD _IO(0x12,125)
#define BLKROTATIONAL _IO(0x12,126)
#define BLKZEROOUT _IO(0x12,127)
+#define BLKGETDISKSEQ _IOR(0x12,128,__u64)
/*
* A jump here: 130-131 are reserved for zoned block devices
* (see uapi/linux/blkzoned.h)
--
2.30.2
From: Matteo Croce <[email protected]>
Add a sequence number to the disk devices. This number is put in the
uevent so userspace can correlate events when a driver reuses a device,
like the loop one.
Signed-off-by: Matteo Croce <[email protected]>
---
block/genhd.c | 19 +++++++++++++++++++
include/linux/genhd.h | 2 ++
2 files changed, 21 insertions(+)
diff --git a/block/genhd.c b/block/genhd.c
index 8c8f543572e6..92debcb9e061 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1215,8 +1215,17 @@ static void disk_release(struct device *dev)
blk_put_queue(disk->queue);
kfree(disk);
}
+
+static int block_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq);
+}
+
struct class block_class = {
.name = "block",
+ .dev_uevent = block_uevent,
};
static char *block_devnode(struct device *dev, umode_t *mode,
@@ -1388,6 +1397,8 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
disk_to_dev(disk)->class = &block_class;
disk_to_dev(disk)->type = &disk_type;
device_initialize(disk_to_dev(disk));
+ inc_diskseq(disk);
+
return disk;
out_destroy_part_tbl:
@@ -1938,3 +1949,11 @@ static void disk_release_events(struct gendisk *disk)
WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
kfree(disk->ev);
}
+
+void inc_diskseq(struct gendisk *disk)
+{
+ static atomic64_t diskseq;
+
+ disk->diskseq = atomic64_inc_return(&diskseq);
+}
+EXPORT_SYMBOL_GPL(inc_diskseq);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index f364619092cc..632141b360d2 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -167,6 +167,7 @@ struct gendisk {
int node_id;
struct badblocks *bb;
struct lockdep_map lockdep_map;
+ u64 diskseq;
};
/*
@@ -326,6 +327,7 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev,
#endif /* CONFIG_SYSFS */
extern struct rw_semaphore bdev_lookup_sem;
+extern void inc_diskseq(struct gendisk *disk);
dev_t blk_lookup_devt(const char *name, int partno);
void blk_request_module(dev_t devt);
--
2.30.2
From: Matteo Croce <[email protected]>
Move the sysfs register code from a function named disk_add_events() to
a new function named disk_add_sysfs(). Also, rename the attribute list
with a more generic name than disk_events_attrs.
Signed-off-by: Matteo Croce <[email protected]>
---
block/genhd.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/block/genhd.c b/block/genhd.c
index 92debcb9e061..57d92ea7ae05 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -41,6 +41,7 @@ static void disk_alloc_events(struct gendisk *disk);
static void disk_add_events(struct gendisk *disk);
static void disk_del_events(struct gendisk *disk);
static void disk_release_events(struct gendisk *disk);
+static void disk_add_sysfs(struct gendisk *disk);
void set_capacity(struct gendisk *disk, sector_t sectors)
{
@@ -628,6 +629,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
*/
WARN_ON_ONCE(!blk_get_queue(disk->queue));
+ disk_add_sysfs(disk);
disk_add_events(disk);
blk_integrity_add(disk);
}
@@ -1838,7 +1840,7 @@ static const DEVICE_ATTR(events_poll_msecs, 0644,
disk_events_poll_msecs_show,
disk_events_poll_msecs_store);
-static const struct attribute *disk_events_attrs[] = {
+static const struct attribute *disk_sysfs_attrs[] = {
&dev_attr_events.attr,
&dev_attr_events_async.attr,
&dev_attr_events_poll_msecs.attr,
@@ -1909,13 +1911,16 @@ static void disk_alloc_events(struct gendisk *disk)
disk->ev = ev;
}
-static void disk_add_events(struct gendisk *disk)
+static void disk_add_sysfs(struct gendisk *disk)
{
/* FIXME: error handling */
- if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0)
+ if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_sysfs_attrs) < 0)
pr_warn("%s: failed to create sysfs files for events\n",
disk->disk_name);
+}
+static void disk_add_events(struct gendisk *disk)
+{
if (!disk->ev)
return;
@@ -1940,7 +1945,7 @@ static void disk_del_events(struct gendisk *disk)
mutex_unlock(&disk_events_mutex);
}
- sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs);
+ sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_sysfs_attrs);
}
static void disk_release_events(struct gendisk *disk)
--
2.30.2
From: Matteo Croce <[email protected]>
On a very loaded system, if there are many events queued up from multiple
attach/detach cycles, it's impossible to match them up with the
LOOP_CONFIGURE or LOOP_SET_FD call, since we don't know where the position
of our own association in the queue is[1].
Not even an empty uevent queue is a reliable indication that we already
received the uevent we were waiting for, since with multi-partition block
devices each partition's event is queued asynchronously and might be
delivered later.
Increment the disk sequence number when setting or changing the backing
file, so the userspace knows which backing file generated the event.
[1] https://github.com/systemd/systemd/issues/17469#issuecomment-762919781
Signed-off-by: Matteo Croce <[email protected]>
---
drivers/block/loop.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index a370cde3ddd4..1541ccff81f9 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -734,6 +734,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
goto out_err;
/* and ... switch */
+ inc_diskseq(lo->lo_disk);
blk_mq_freeze_queue(lo->lo_queue);
mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
lo->lo_backing_file = file;
@@ -1122,6 +1123,8 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
if (error)
goto out_unlock;
+ inc_diskseq(lo->lo_disk);
+
if (!(file->f_mode & FMODE_WRITE) || !(mode & FMODE_WRITE) ||
!file->f_op->write_iter)
lo->lo_flags |= LO_FLAGS_READ_ONLY;
--
2.30.2
On Mon, Mar 15, 2021 at 09:02:39PM +0100, Matteo Croce wrote:
> +++ b/include/uapi/linux/fs.h
> @@ -184,6 +184,7 @@ struct fsxattr {
> #define BLKSECDISCARD _IO(0x12,125)
> #define BLKROTATIONAL _IO(0x12,126)
> #define BLKZEROOUT _IO(0x12,127)
> +#define BLKGETDISKSEQ _IOR(0x12,128,__u64)
> /*
> * A jump here: 130-131 are reserved for zoned block devices
> * (see uapi/linux/blkzoned.h)
Not your bug, but this is now 130-136.
+cc all the people who signed off on the commits that added those ioctl
numbers without updating this comment. Perhaps one of them will figure
out how to stop this happening in future.
On 2021/03/16 5:14, Matthew Wilcox wrote:
> On Mon, Mar 15, 2021 at 09:02:39PM +0100, Matteo Croce wrote:
>> +++ b/include/uapi/linux/fs.h
>> @@ -184,6 +184,7 @@ struct fsxattr {
>> #define BLKSECDISCARD _IO(0x12,125)
>> #define BLKROTATIONAL _IO(0x12,126)
>> #define BLKZEROOUT _IO(0x12,127)
>> +#define BLKGETDISKSEQ _IOR(0x12,128,__u64)
>> /*
>> * A jump here: 130-131 are reserved for zoned block devices
>> * (see uapi/linux/blkzoned.h)
>
> Not your bug, but this is now 130-136.
>
> +cc all the people who signed off on the commits that added those ioctl
> numbers without updating this comment. Perhaps one of them will figure
> out how to stop this happening in future.
>
Indeed. Will be more careful :)
And send a patch to fix this.
Thanks !
--
Damien Le Moal
Western Digital Research
On Mon, Mar 15, 2021 at 08:18:24PM +0000, Matthew Wilcox wrote:
> On Mon, Mar 15, 2021 at 09:02:38PM +0100, Matteo Croce wrote:
> > From: Matteo Croce <[email protected]>
> >
> > Add a sequence number to the disk devices. This number is put in the
> > uevent so userspace can correlate events when a driver reuses a device,
> > like the loop one.
>
> Should this be documented as monotonically increasing? I think this
> is actually a media identifier. Consider (if you will) a floppy disc.
> Back when such things were common, it was possible with personal computers
> of the era to have multiple floppy discs "in play" and be prompted to
> insert them as needed. So shouldn't it be possible to support something
> similar here -- you're really removing the media from the loop device.
> With a monotonically increasing number, you're always destroying the
> media when you remove it, but in principle, it should be possible to
> reinsert the same media and have the same media identifier number.
So ... a lot of devices have UUIDs or similar. eg:
$ cat /sys/block/nvme0n1/uuid
e8238fa6-bf53-0001-001b-448b49cec94f
https://linux.die.net/man/8/scsi_id (for scsi)
how about making this way more generic; create an xattr on a file to
store the uuid (if one doesn't already exist) whenever it's used as the
base for a loop device. then sysfs (or whatever) can report the contents
of that xattr as the unique id.
That can be mostly in userspace -- losetup can create it, and read it.
It can be passed in as the first two current-reserved __u64 entries in
loop_config. The only kernel change should be creating the sysfs
entry /sys/block/loopN/uuid from those two array entries.
On Mo, 15.03.21 21:04, Matthew Wilcox ([email protected]) wrote:
> On Mon, Mar 15, 2021 at 08:18:24PM +0000, Matthew Wilcox wrote:
> > On Mon, Mar 15, 2021 at 09:02:38PM +0100, Matteo Croce wrote:
> > > From: Matteo Croce <[email protected]>
> > >
> > > Add a sequence number to the disk devices. This number is put in the
> > > uevent so userspace can correlate events when a driver reuses a device,
> > > like the loop one.
> >
> > Should this be documented as monotonically increasing? I think this
> > is actually a media identifier. Consider (if you will) a floppy disc.
> > Back when such things were common, it was possible with personal computers
> > of the era to have multiple floppy discs "in play" and be prompted to
> > insert them as needed. So shouldn't it be possible to support something
> > similar here -- you're really removing the media from the loop device.
> > With a monotonically increasing number, you're always destroying the
> > media when you remove it, but in principle, it should be possible to
> > reinsert the same media and have the same media identifier number.
>
> So ... a lot of devices have UUIDs or similar. eg:
>
> $ cat /sys/block/nvme0n1/uuid
> e8238fa6-bf53-0001-001b-448b49cec94f
>
> https://linux.die.net/man/8/scsi_id (for scsi)
>
> how about making this way more generic; create an xattr on a file to
> store the uuid (if one doesn't already exist) whenever it's used as the
> base for a loop device. then sysfs (or whatever) can report the contents
> of that xattr as the unique id.
>
> That can be mostly in userspace -- losetup can create it, and read it.
> It can be passed in as the first two current-reserved __u64 entries in
> loop_config. The only kernel change should be creating the sysfs
> entry /sys/block/loopN/uuid from those two array entries.
As a (part-time) maintainer of udev: as one major likely consumer of
this I'd *really* prefer some concept here that works without
`losetup` needing to be patched. i.e. we have plenty userspace that
calls LOOP_CONFIGURE or LOOP_SET_FD, not just losetup, and we'd have
to patch them all. In particular in a world of containers it's even
worse: people probably will continue to use old userspaces (mixed with
newer ones) for a very long time (decades!), and those old userspaces
won't fill in the fields for the ioctl hence.
Hence, for me it would be essential to have an identifier that is
assigned by the kernel, instead of requiring userspace to assign it,
because userspace won't for a long long time.
I'd be OK with a hybrid approach where userspace *can* fill
something in, but doesn't have to in which case the kernel would fill
it in.
That all said, I very much prefer if we'd use a kernel-enforced
"sequence number" or "generation counter" or so for this instead of a
uuid or random cookie or so. Why? because it allows userspace that
monitors things to derive ordering from these ids: when you watch
these events and see a uevent for a device seqno=4711 then you know
that it is from an earlier use than one you see for seqno=8878. UUIDs
can't give you that. That's in particular a nice property since
uevents/netlink are not a reliable transport: messages can get lost
when the socket buffers overrun, or when udev as the uevent broker
gets overloaded. Hence, for a userspace program it's kinda nice to
know whether it's worth waiting for a specific loop device use or if
it's clear that ship has sailed already: i.e. if my own use of a
specific loop device gets seqno 777 then I know it still makes sense
to wait for appropriate uevents as long as I see seqno <= 776. But if
I see seqno >= 778 then I know it's not worth waiting anymore and
one component in the uevent message chain has dropped my messages.
But of course, beggars can't be choosers. If a seqno/generation
counter concept is not in the cards, I'd be OK with a uuid/random
cookie approach too. And if an approach where the kernel assigns these
seqnos strictly monotonically is not in the cards, then I'd be OK with
an approach where userspace can pick the ids, too. I'll take what I
can get. My primary concern is that we get something to match up
uevents, partition devices and the main block device with, and all of
the suggested approaches could deliver that.
Lennart
--
Lennart Poettering, Berlin
From: Matteo Croce <[email protected]>
Add a new sysfs handle to export the new diskseq value.
Place it in <sysfs>/block/<disk>/diskseq and document it.
Signed-off-by: Matteo Croce <[email protected]>
---
Documentation/ABI/testing/sysfs-block | 12 ++++++++++++
block/genhd.c | 11 +++++++++++
2 files changed, 23 insertions(+)
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index e34cdeeeb9d4..a0ed87386639 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -28,6 +28,18 @@ Description:
For more details refer Documentation/admin-guide/iostats.rst
+What: /sys/block/<disk>/diskseq
+Date: February 2021
+Contact: Matteo Croce <[email protected]>
+Description:
+ The /sys/block/<disk>/diskseq file reports the disk
+ sequence number, which is a monotonically increasing
+ number assigned to every drive.
+ Some devices, like the loop device, refresh such number
+ every time the backing file is changed.
+ The value type is 64 bit unsigned.
+
+
What: /sys/block/<disk>/<part>/stat
Date: February 2008
Contact: Jerome Marchand <[email protected]>
diff --git a/block/genhd.c b/block/genhd.c
index 57d92ea7ae05..6a7ed426def0 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1764,6 +1764,7 @@ static void disk_check_events(struct disk_events *ev,
* events_async : list of events which can be detected w/o polling
* (always empty, only for backwards compatibility)
* events_poll_msecs : polling interval, 0: disable, -1: system default
+ * diskseq : disk sequence number, since boot
*/
static ssize_t __disk_events_show(unsigned int events, char *buf)
{
@@ -1834,16 +1835,26 @@ static ssize_t disk_events_poll_msecs_store(struct device *dev,
return count;
}
+static ssize_t diskseq_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ return sprintf(buf, "%llu\n", disk->diskseq);
+}
+
static const DEVICE_ATTR(events, 0444, disk_events_show, NULL);
static const DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL);
static const DEVICE_ATTR(events_poll_msecs, 0644,
disk_events_poll_msecs_show,
disk_events_poll_msecs_store);
+static const DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL);
static const struct attribute *disk_sysfs_attrs[] = {
&dev_attr_events.attr,
&dev_attr_events_async.attr,
&dev_attr_events_poll_msecs.attr,
+ &dev_attr_diskseq.attr,
NULL,
};
--
2.30.2
On Mon, Mar 15, 2021 at 09:02:38PM +0100, Matteo Croce wrote:
> From: Matteo Croce <[email protected]>
>
> Add a sequence number to the disk devices. This number is put in the
> uevent so userspace can correlate events when a driver reuses a device,
> like the loop one.
Should this be documented as monotonically increasing? I think this
is actually a media identifier. Consider (if you will) a floppy disc.
Back when such things were common, it was possible with personal computers
of the era to have multiple floppy discs "in play" and be prompted to
insert them as needed. So shouldn't it be possible to support something
similar here -- you're really removing the media from the loop device.
With a monotonically increasing number, you're always destroying the
media when you remove it, but in principle, it should be possible to
reinsert the same media and have the same media identifier number.
On Mon, Mar 15, 2021 at 9:13 PM Matthew Wilcox <[email protected]> wrote:
>
> On Mon, Mar 15, 2021 at 09:02:39PM +0100, Matteo Croce wrote:
> > +++ b/include/uapi/linux/fs.h
> > @@ -184,6 +184,7 @@ struct fsxattr {
> > #define BLKSECDISCARD _IO(0x12,125)
> > #define BLKROTATIONAL _IO(0x12,126)
> > #define BLKZEROOUT _IO(0x12,127)
> > +#define BLKGETDISKSEQ _IOR(0x12,128,__u64)
> > /*
> > * A jump here: 130-131 are reserved for zoned block devices
> > * (see uapi/linux/blkzoned.h)
>
> Not your bug, but this is now 130-136.
>
> +cc all the people who signed off on the commits that added those ioctl
> numbers without updating this comment. Perhaps one of them will figure
> out how to stop this happening in future.
Note taken, thanks!
--
per aspera ad upstream
On 3/16/21 4:02 AM, Matteo Croce wrote:
> From: Matteo Croce <[email protected]>
>
> Add a sequence number to the disk devices. This number is put in the
> uevent so userspace can correlate events when a driver reuses a device,
> like the loop one.
>
> Signed-off-by: Matteo Croce <[email protected]>
> ---
> block/genhd.c | 19 +++++++++++++++++++
> include/linux/genhd.h | 2 ++
> 2 files changed, 21 insertions(+)
>
> diff --git a/block/genhd.c b/block/genhd.c
> index 8c8f543572e6..92debcb9e061 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -1215,8 +1215,17 @@ static void disk_release(struct device *dev)
> blk_put_queue(disk->queue);
> kfree(disk);
> }
> +
> +static int block_uevent(struct device *dev, struct kobj_uevent_env *env)
> +{
> + struct gendisk *disk = dev_to_disk(dev);
> +
> + return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq);
> +}
> +
> struct class block_class = {
> .name = "block",
> + .dev_uevent = block_uevent,
> };
>
> static char *block_devnode(struct device *dev, umode_t *mode,
> @@ -1388,6 +1397,8 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
> disk_to_dev(disk)->class = &block_class;
> disk_to_dev(disk)->type = &disk_type;
> device_initialize(disk_to_dev(disk));
> + inc_diskseq(disk);
> +
> return disk;
>
> out_destroy_part_tbl:
> @@ -1938,3 +1949,11 @@ static void disk_release_events(struct gendisk *disk)
> WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
> kfree(disk->ev);
> }
> +
> +void inc_diskseq(struct gendisk *disk)
> +{
> + static atomic64_t diskseq;
> +
> + disk->diskseq = atomic64_inc_return(&diskseq);
> +}
> +EXPORT_SYMBOL_GPL(inc_diskseq);
Hi, I'm quite interested in this 'seqnum'. Actually I'm also planning to
add support for some sort of 'seqnum' when supporting IO polling for dm
devices, so that every time dm device changes its dm table, the seqnum
will be increased.
As for your patch, @diskseq is declared as a static variable in
inc_diskseq(). I doubt whether all callers of inc_diskseq() will share
*one* counter when inc_diskseq() is compiled as a separate call entry
rather than inlined.
> diff --git a/include/linux/genhd.h b/include/linux/genhd.h
> index f364619092cc..632141b360d2 100644
> --- a/include/linux/genhd.h
> +++ b/include/linux/genhd.h
> @@ -167,6 +167,7 @@ struct gendisk {
> int node_id;
> struct badblocks *bb;
> struct lockdep_map lockdep_map;
> + u64 diskseq;
> };
>
> /*
> @@ -326,6 +327,7 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev,
> #endif /* CONFIG_SYSFS */
>
> extern struct rw_semaphore bdev_lookup_sem;
> +extern void inc_diskseq(struct gendisk *disk);
>
> dev_t blk_lookup_devt(const char *name, int partno);
> void blk_request_module(dev_t devt);
>
--
Thanks,
Jeffle
On Mon, Mar 15, 2021 at 08:18:24PM +0000, Matthew Wilcox wrote:
> On Mon, Mar 15, 2021 at 09:02:38PM +0100, Matteo Croce wrote:
> > From: Matteo Croce <[email protected]>
> >
> > Add a sequence number to the disk devices. This number is put in the
> > uevent so userspace can correlate events when a driver reuses a device,
> > like the loop one.
>
> Should this be documented as monotonically increasing? I think this
> is actually a media identifier. Consider (if you will) a floppy disc.
> Back when such things were common, it was possible with personal computers
> of the era to have multiple floppy discs "in play" and be prompted to
> insert them as needed. So shouldn't it be possible to support something
> similar here -- you're really removing the media from the loop device.
> With a monotonically increasing number, you're always destroying the
> media when you remove it, but in principle, it should be possible to
> reinsert the same media and have the same media identifier number.
And we have some decent infrastructure related to media changes,
grep for disk_events. I think this needs to plug into that
infrastructure instead of duplicating it.
On Tue, Mar 16, 2021 at 2:44 AM JeffleXu <[email protected]> wrote:
>
> On 3/16/21 4:02 AM, Matteo Croce wrote:
> > From: Matteo Croce <[email protected]>
> >
> > Add a sequence number to the disk devices. This number is put in the
> > uevent so userspace can correlate events when a driver reuses a device,
> > like the loop one.
>
> Hi, I'm quite interested in this 'seqnum'. Actually I'm also planing to
> add support for some sort of 'seqnum' when supporting IO polling for dm
> devices, so that every time dm device changes its dm table, the seqnum
> will be increased.
>
Interesting, thanks!
> As for your patch, @diskseq is declared as one static variable in
> inc_diskseq(). Then I doubt if all callers of inc_diskseq() will share
> *one* counting when inc_diskseq() is compiled as the separate call entry
> rather than inlined.
>
That would be true if the static declaration was in the .h, but being
in genhd.c it goes in vmlinux once.
Maybe you get confused by the inc_diskseq prototype in the header
file, but the function body is in the .c
Regards,
--
per aspera ad upstream
On Mon, Mar 15, 2021 at 10:05 PM Matthew Wilcox <[email protected]> wrote:
>
> On Mon, Mar 15, 2021 at 08:18:24PM +0000, Matthew Wilcox wrote:
> > On Mon, Mar 15, 2021 at 09:02:38PM +0100, Matteo Croce wrote:
> > > From: Matteo Croce <[email protected]>
> > >
> > > Add a sequence number to the disk devices. This number is put in the
> > > uevent so userspace can correlate events when a driver reuses a device,
> > > like the loop one.
> >
> > Should this be documented as monotonically increasing? I think this
> > is actually a media identifier. Consider (if you will) a floppy disc.
> > Back when such things were common, it was possible with personal computers
> > of the era to have multiple floppy discs "in play" and be prompted to
> > insert them as needed. So shouldn't it be possible to support something
> > similar here -- you're really removing the media from the loop device.
> > With a monotonically increasing number, you're always destroying the
> > media when you remove it, but in principle, it should be possible to
> > reinsert the same media and have the same media identifier number.
>
> So ... a lot of devices have UUIDs or similar. eg:
>
> $ cat /sys/block/nvme0n1/uuid
> e8238fa6-bf53-0001-001b-448b49cec94f
>
> https://linux.die.net/man/8/scsi_id (for scsi)
>
Hi,
I don't have uuid anywhere:
matteo@saturno:~$ ll /dev/sd?
brw-rw---- 1 root disk 8, 0 feb 16 13:24 /dev/sda
brw-rw---- 1 root disk 8, 16 feb 16 13:24 /dev/sdb
brw-rw---- 1 root disk 8, 32 feb 16 13:24 /dev/sdc
brw-rw---- 1 root disk 8, 48 feb 16 13:24 /dev/sdd
brw-rw---- 1 root disk 8, 64 mar 4 06:26 /dev/sde
brw-rw---- 1 root disk 8, 80 feb 16 13:24 /dev/sdf
matteo@saturno:~$ ll /sys/block/*/uuid
ls: cannot access '/sys/block/*/uuid': No such file or directory
mcroce@t490s:~$ ll /dev/nvme0n1
brw-rw----. 1 root disk 259, 0 25 mar 14.22 /dev/nvme0n1
mcroce@t490s:~$ ll /sys/block/*/uuid
ls: cannot access '/sys/block/*/uuid': No such file or directory
I find it only on a mdraid array:
$ cat /sys/devices/virtual/block/md127/md/uuid
26117338-4f54-f14e-b5d4-93feb7fe825d
I'm using a vanilla 5.11 kernel.
Regards,
--
per aspera ad upstream
On Mo, 15.03.21 20:18, Matthew Wilcox ([email protected]) wrote:
> On Mon, Mar 15, 2021 at 09:02:38PM +0100, Matteo Croce wrote:
> > From: Matteo Croce <[email protected]>
> >
> > Add a sequence number to the disk devices. This number is put in the
> > uevent so userspace can correlate events when a driver reuses a device,
> > like the loop one.
>
> Should this be documented as monotonically increasing?
I think this would be great. My usecase for this would be to match up
uevents with loopback block device attachments, because that's
basically impossible right now: you attach a loopback device to a
file, and then wait for the relevant uevents to happen, for all
partitions but you cannot do this safely right now, since loopback
block devices are heavily reused in many scenarios so you never know
if a uevent is from the attachment you created yourself or from a
previous one — or even already for the next.
If this would be documented as being monotonic this would be excellent
for this usecase: if you know that your own use of a specific loopback
device got seqno x then you know that if you see uevents for seqno < x
it makes sense to wait longer, but when you see seqno > x then you
know it's too late, somehow you lost uevents and should abort.
Hence: for my usecase having this strictly monotonic, and thus being
able to *order* attachments across all areas where the seqno appears
would be absolutely excellent and make this as robust as it possibly
could be.
> I think this is actually a media identifier. Consider (if you will)
> a floppy disc. Back when such things were common, it was possible
> with personal computers of the era to have multiple floppy discs "in
> play" and be prompted to insert them as needed. So shouldn't it be
> possible to support something similar here -- you're really removing
> the media from the loop device. With a monotonically increasing
> number, you're always destroying the media when you remove it, but
> in principle, it should be possible to reinsert the same media and
> have the same media identifier number.
This would be useless for my usecase, we don't really care for the
precise file being attached (which is queriable via sysfs anyway), but
we want to match up our use of the device with the uevents it
generates on itself and descendant partition block devices.
Hence: for my usecase I want something that recognizes *attachments*
and not media. If I attach the same media 3 times I want to be able to
discern the three times. And more importantly: if I attach it once and
someone else also once, then I don't want to get confused by that and
want to be able to distinguish both attachments.
Moreover, I am not even sure what media identifier would mean: if you
have one image and then copy it, is that still the same image? in your
model, should that have distinct ids? or the same, because it is from
the same common original version? and if i then modify one, what
happens then?
Finally, media usually comes with ids anyway. i.e. file systems have
uuids, GPT partition tables have media uuids. The infrastructure for
that already exists. What we need really is something that allows us
to track attachments, not media.
(That said, I think it would make sense to bump the IDs not only on
explicit user-induced reattachments, but also when media is replaced,
i.e. bump it more often than not)
Lennart
--
Lennart Poettering, Berlin
On Mo, 15.03.21 21:04, Matthew Wilcox ([email protected]) wrote:
> On Mon, Mar 15, 2021 at 08:18:24PM +0000, Matthew Wilcox wrote:
> > On Mon, Mar 15, 2021 at 09:02:38PM +0100, Matteo Croce wrote:
> > > From: Matteo Croce <[email protected]>
> > >
> > > Add a sequence number to the disk devices. This number is put in the
> > > uevent so userspace can correlate events when a driver reuses a device,
> > > like the loop one.
> >
> > Should this be documented as monotonically increasing? I think this
> > is actually a media identifier. Consider (if you will) a floppy disc.
> > Back when such things were common, it was possible with personal computers
> > of the era to have multiple floppy discs "in play" and be prompted to
> > insert them as needed. So shouldn't it be possible to support something
> > similar here -- you're really removing the media from the loop device.
> > With a monotonically increasing number, you're always destroying the
> > media when you remove it, but in principle, it should be possible to
> > reinsert the same media and have the same media identifier number.
>
> So ... a lot of devices have UUIDs or similar. eg:
>
> $ cat /sys/block/nvme0n1/uuid
> e8238fa6-bf53-0001-001b-448b49cec94f
>
> https://linux.die.net/man/8/scsi_id (for scsi)
>
> how about making this way more generic; create an xattr on a file to
> store the uuid (if one doesn't already exist) whenever it's used as the
> base for a loop device. then sysfs (or whatever) can report the contents
> of that xattr as the unique id.
>
> That can be mostly in userspace -- losetup can create it, and read it.
> It can be passed in as the first two current-reserved __u64 entries in
> loop_config. The only kernel change should be creating the sysfs
> entry /sys/block/loopN/uuid from those two array entries.
I prefer seqnos over uuids because we can order them when we see a
bunch of uevents for the same loopback device with their seqnos, as
mentioned in that other mail.
But beggars can't be choosers. If we could propagate some uuid from
the loopback setup ioctl into the device so that that appears via
sysfs that would work too for me, but not as robustly, since we lack
the ordering to detect whether it's worth waiting for more uevents or
if somebody else already took possession of the device.
TLDR: seqnos FTW! But uuids assigned at attachment time are better than
nothing.
Lennart
--
Lennart Poettering, Berlin
On 3/25/21 6:29 PM, Matteo Croce wrote:
> On Mon, Mar 15, 2021 at 10:05 PM Matthew Wilcox <[email protected]> wrote:
>>
>> On Mon, Mar 15, 2021 at 08:18:24PM +0000, Matthew Wilcox wrote:
>>> On Mon, Mar 15, 2021 at 09:02:38PM +0100, Matteo Croce wrote:
>>>> From: Matteo Croce <[email protected]>
>>>>
>>>> Add a sequence number to the disk devices. This number is put in the
>>>> uevent so userspace can correlate events when a driver reuses a device,
>>>> like the loop one.
>>>
>>> Should this be documented as monotonically increasing? I think this
>>> is actually a media identifier. Consider (if you will) a floppy disc.
>>> Back when such things were common, it was possible with personal computers
>>> of the era to have multiple floppy discs "in play" and be prompted to
>>> insert them as needed. So shouldn't it be possible to support something
>>> similar here -- you're really removing the media from the loop device.
>>> With a monotonically increasing number, you're always destroying the
>>> media when you remove it, but in principle, it should be possible to
>>> reinsert the same media and have the same media identifier number.
>>
>> So ... a lot of devices have UUIDs or similar. eg:
>>
>> $ cat /sys/block/nvme0n1/uuid
>> e8238fa6-bf53-0001-001b-448b49cec94f
>>
>> https://linux.die.net/man/8/scsi_id (for scsi)
>>
>
> Hi,
>
> I don't have uuid anywhere:
>
> matteo@saturno:~$ ll /dev/sd?
> brw-rw---- 1 root disk 8, 0 feb 16 13:24 /dev/sda
> brw-rw---- 1 root disk 8, 16 feb 16 13:24 /dev/sdb
> brw-rw---- 1 root disk 8, 32 feb 16 13:24 /dev/sdc
> brw-rw---- 1 root disk 8, 48 feb 16 13:24 /dev/sdd
> brw-rw---- 1 root disk 8, 64 mar 4 06:26 /dev/sde
> brw-rw---- 1 root disk 8, 80 feb 16 13:24 /dev/sdf
> matteo@saturno:~$ ll /sys/block/*/uuid
> ls: cannot access '/sys/block/*/uuid': No such file or directory
>
> mcroce@t490s:~$ ll /dev/nvme0n1
> brw-rw----. 1 root disk 259, 0 25 mar 14.22 /dev/nvme0n1
> mcroce@t490s:~$ ll /sys/block/*/uuid
> ls: cannot access '/sys/block/*/uuid': No such file or directory
>
> I find it only on a mdraid array:
>
> $ cat /sys/devices/virtual/block/md127/md/uuid
> 26117338-4f54-f14e-b5d4-93feb7fe825d
>
> I'm using a vanilla 5.11 kernel.
>
The 'uuid' is optional for NVMe devices, and indeed not even present for
other device types.
Use the 'wwid' attribute, which contains a unique identifier for all
nvme devices:
# cat /sys/block/nvme*/wwid
nvme.8086-4356504436343735303034323430304e474e-564f303430304b45464a42-00000001
nvme.8086-4356504436343735303034363430304e474e-564f303430304b45464a42-00000001
uuid.3c6500ee-a775-4c89-b223-e9551f5a9f7a
and for SCSI the wwid is part of the SCSI device:
# cat /sys/block/sd*/device/wwid
naa.600508b1001ce2e648a35b6ec14a3996
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
[email protected] +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer
On Di, 16.03.21 14:13, Christoph Hellwig ([email protected]) wrote:
> On Mon, Mar 15, 2021 at 08:18:24PM +0000, Matthew Wilcox wrote:
> > On Mon, Mar 15, 2021 at 09:02:38PM +0100, Matteo Croce wrote:
> > > From: Matteo Croce <[email protected]>
> > >
> > > Add a sequence number to the disk devices. This number is put in the
> > > uevent so userspace can correlate events when a driver reuses a device,
> > > like the loop one.
> >
> > Should this be documented as monotonically increasing? I think this
> > is actually a media identifier. Consider (if you will) a floppy disc.
> > Back when such things were common, it was possible with personal computers
> > of the era to have multiple floppy discs "in play" and be prompted to
> > insert them as needed. So shouldn't it be possible to support something
> > similar here -- you're really removing the media from the loop device.
> > With a monotonically increasing number, you're always destroying the
> > media when you remove it, but in principle, it should be possible to
> > reinsert the same media and have the same media identifier number.
>
> And we have some decent infrastructure related to media changes,
> grep for disk_events. I think this needs to plug into that
> infrastructure instead of duplicating it.
I'd argue this makes sense in one way only, i.e. that whenever the
media_change event is seen the seqnum is implicitly bumped.
I am pretty sure though that loopback devices shouldn't synthesize
media_change events themselves. There's quite a difference, I
would argue, between a real media change event caused by external
effects (i.e. humans/hw buttons/sensors) and loop device reuse, which is
exclusively triggered by internal events (i.e. local code). Moreover I
think the loopback subsystem should manage the seqnum on its own,
since it ideally would return the assigned seqnum immediately from the
attachment ioctl, i.e. it shouldn't just be a side-effect of
attachment, but a part of it, if you follow what I mean.
Does that make sense?
Matteo, would it make sense to extend your patch set to bump the
seqnum implicitly on media_change for devices that implement that?
Lennart