From: Matteo Croce <[email protected]>
With this series a monotonically increasing number is added to disks,
precisely in the genhd struct, and it's exported in sysfs and uevent.
This helps the userspace correlate events for devices that reuse the
same device, like loop.
The first patch is the core one, the 2..4 expose the information in
different ways, the 5th increases the seqnum on media change and
the last one increases the sequence number for loop devices upon
attach, detach or reconfigure.
If merged, this feature will immediately used by the userspace:
https://github.com/systemd/systemd/issues/17469#issuecomment-762919781
v2 -> v3:
- rebased on top of 5.13-rc7
- resend because it appeared archived on patchwork
v1 -> v2:
- increase seqnum on media change
- increase on loop detach
Matteo Croce (6):
block: add disk sequence number
block: add ioctl to read the disk sequence number
block: refactor sysfs code
block: export diskseq in sysfs
block: increment sequence number
loop: increment sequence number
Documentation/ABI/testing/sysfs-block | 12 +++++++
block/genhd.c | 46 ++++++++++++++++++++++++---
block/ioctl.c | 2 ++
drivers/block/loop.c | 5 +++
include/linux/genhd.h | 2 ++
include/uapi/linux/fs.h | 1 +
6 files changed, 64 insertions(+), 4 deletions(-)
--
2.31.1
From: Matteo Croce <[email protected]>
Add a monotonically increasing number to disk devices. This number is put
in the uevent so userspace can correlate events when a driver reuses a
device, like cdrom or loop.
$ udevadm info /sys/class/block/* |grep -e DEVNAME -e DISKSEQ
E: DEVNAME=/dev/loop0
E: DISKSEQ=1
E: DEVNAME=/dev/loop1
E: DISKSEQ=2
E: DEVNAME=/dev/loop2
E: DISKSEQ=3
E: DEVNAME=/dev/loop3
E: DISKSEQ=4
E: DEVNAME=/dev/loop4
E: DISKSEQ=5
E: DEVNAME=/dev/loop5
E: DISKSEQ=6
E: DEVNAME=/dev/loop6
E: DISKSEQ=7
E: DEVNAME=/dev/loop7
E: DISKSEQ=8
E: DEVNAME=/dev/nvme0n1
E: DISKSEQ=9
E: DEVNAME=/dev/nvme0n1p1
E: DISKSEQ=9
E: DEVNAME=/dev/nvme0n1p2
E: DISKSEQ=9
E: DEVNAME=/dev/nvme0n1p3
E: DISKSEQ=9
E: DEVNAME=/dev/nvme0n1p4
E: DISKSEQ=9
E: DEVNAME=/dev/nvme0n1p5
E: DISKSEQ=9
E: DEVNAME=/dev/sda
E: DISKSEQ=10
E: DEVNAME=/dev/sda1
E: DISKSEQ=10
E: DEVNAME=/dev/sda2
E: DISKSEQ=10
Signed-off-by: Matteo Croce <[email protected]>
---
block/genhd.c | 19 +++++++++++++++++++
include/linux/genhd.h | 2 ++
2 files changed, 21 insertions(+)
diff --git a/block/genhd.c b/block/genhd.c
index 9f8cb7beaad1..c96b667136ee 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1129,8 +1129,17 @@ static void disk_release(struct device *dev)
blk_put_queue(disk->queue);
kfree(disk);
}
+
+static int block_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq);
+}
+
struct class block_class = {
.name = "block",
+ .dev_uevent = block_uevent,
};
static char *block_devnode(struct device *dev, umode_t *mode,
@@ -1304,6 +1313,8 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
disk_to_dev(disk)->class = &block_class;
disk_to_dev(disk)->type = &disk_type;
device_initialize(disk_to_dev(disk));
+ inc_diskseq(disk);
+
return disk;
out_destroy_part_tbl:
@@ -1854,3 +1865,11 @@ static void disk_release_events(struct gendisk *disk)
WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
kfree(disk->ev);
}
+
+void inc_diskseq(struct gendisk *disk)
+{
+ static atomic64_t diskseq;
+
+ disk->diskseq = atomic64_inc_return(&diskseq);
+}
+EXPORT_SYMBOL_GPL(inc_diskseq);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 6fc26f7bdf71..a0d04250a2db 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -167,6 +167,7 @@ struct gendisk {
int node_id;
struct badblocks *bb;
struct lockdep_map lockdep_map;
+ u64 diskseq;
};
/*
@@ -306,6 +307,7 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev,
}
#endif /* CONFIG_SYSFS */
+void inc_diskseq(struct gendisk *disk);
dev_t blk_lookup_devt(const char *name, int partno);
void blk_request_module(dev_t devt);
#ifdef CONFIG_BLOCK
--
2.31.1
From: Matteo Croce <[email protected]>
Add a new BLKGETDISKSEQ ioctl which retrieves the disk sequence number
from the genhd structure.
# ./getdiskseq /dev/loop*
/dev/loop0: 13
/dev/loop0p1: 13
/dev/loop0p2: 13
/dev/loop0p3: 13
/dev/loop1: 14
/dev/loop1p1: 14
/dev/loop1p2: 14
/dev/loop2: 5
/dev/loop3: 6
Signed-off-by: Matteo Croce <[email protected]>
---
block/ioctl.c | 2 ++
include/uapi/linux/fs.h | 1 +
2 files changed, 3 insertions(+)
diff --git a/block/ioctl.c b/block/ioctl.c
index 8ba1ed8defd0..32b339bbaf95 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -469,6 +469,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
BLKDEV_DISCARD_SECURE);
case BLKZEROOUT:
return blk_ioctl_zeroout(bdev, mode, arg);
+ case BLKGETDISKSEQ:
+ return put_u64(argp, bdev->bd_disk->diskseq);
case BLKREPORTZONE:
return blkdev_report_zones_ioctl(bdev, mode, cmd, arg);
case BLKRESETZONE:
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 4c32e97dcdf0..bdf7b404b3e7 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -184,6 +184,7 @@ struct fsxattr {
#define BLKSECDISCARD _IO(0x12,125)
#define BLKROTATIONAL _IO(0x12,126)
#define BLKZEROOUT _IO(0x12,127)
+#define BLKGETDISKSEQ _IOR(0x12,128,__u64)
/*
* A jump here: 130-136 are reserved for zoned block devices
* (see uapi/linux/blkzoned.h)
--
2.31.1
From: Matteo Croce <[email protected]>
Move the sysfs register code from a function named disk_add_events() to
a new function named disk_add_sysfs(). Also, rename the attribute list
with a more generic name than disk_events_attrs.
This is a prerequisite patch to export diskseq in sysfs later.
Signed-off-by: Matteo Croce <[email protected]>
---
block/genhd.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/block/genhd.c b/block/genhd.c
index c96b667136ee..610dd86fd4b6 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -39,6 +39,7 @@ static void disk_alloc_events(struct gendisk *disk);
static void disk_add_events(struct gendisk *disk);
static void disk_del_events(struct gendisk *disk);
static void disk_release_events(struct gendisk *disk);
+static void disk_add_sysfs(struct gendisk *disk);
void set_capacity(struct gendisk *disk, sector_t sectors)
{
@@ -560,6 +561,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
*/
WARN_ON_ONCE(!blk_get_queue(disk->queue));
+ disk_add_sysfs(disk);
disk_add_events(disk);
blk_integrity_add(disk);
}
@@ -1754,7 +1756,7 @@ static const DEVICE_ATTR(events_poll_msecs, 0644,
disk_events_poll_msecs_show,
disk_events_poll_msecs_store);
-static const struct attribute *disk_events_attrs[] = {
+static const struct attribute *disk_sysfs_attrs[] = {
&dev_attr_events.attr,
&dev_attr_events_async.attr,
&dev_attr_events_poll_msecs.attr,
@@ -1825,13 +1827,16 @@ static void disk_alloc_events(struct gendisk *disk)
disk->ev = ev;
}
-static void disk_add_events(struct gendisk *disk)
+static void disk_add_sysfs(struct gendisk *disk)
{
/* FIXME: error handling */
- if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0)
+ if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_sysfs_attrs) < 0)
pr_warn("%s: failed to create sysfs files for events\n",
disk->disk_name);
+}
+static void disk_add_events(struct gendisk *disk)
+{
if (!disk->ev)
return;
@@ -1856,7 +1861,7 @@ static void disk_del_events(struct gendisk *disk)
mutex_unlock(&disk_events_mutex);
}
- sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs);
+ sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_sysfs_attrs);
}
static void disk_release_events(struct gendisk *disk)
--
2.31.1
From: Matteo Croce <[email protected]>
Add a new sysfs handle to export the new diskseq value.
Place it in <sysfs>/block/<disk>/diskseq and document it.
$ grep . /sys/class/block/*/diskseq
/sys/class/block/loop0/diskseq:13
/sys/class/block/loop1/diskseq:14
/sys/class/block/loop2/diskseq:5
/sys/class/block/loop3/diskseq:6
/sys/class/block/ram0/diskseq:1
/sys/class/block/ram1/diskseq:2
/sys/class/block/vda/diskseq:7
Signed-off-by: Matteo Croce <[email protected]>
---
Documentation/ABI/testing/sysfs-block | 12 ++++++++++++
block/genhd.c | 11 +++++++++++
2 files changed, 23 insertions(+)
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index e34cdeeeb9d4..a0ed87386639 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -28,6 +28,18 @@ Description:
For more details refer Documentation/admin-guide/iostats.rst
+What: /sys/block/<disk>/diskseq
+Date: February 2021
+Contact: Matteo Croce <[email protected]>
+Description:
+ The /sys/block/<disk>/diskseq files reports the disk
+ sequence number, which is a monotonically increasing
+ number assigned to every drive.
+ Some devices, like the loop device, refresh such number
+ every time the backing file is changed.
+ The value type is 64 bit unsigned.
+
+
What: /sys/block/<disk>/<part>/stat
Date: February 2008
Contact: Jerome Marchand <[email protected]>
diff --git a/block/genhd.c b/block/genhd.c
index 610dd86fd4b6..768d8d5d1eca 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1680,6 +1680,7 @@ static void disk_check_events(struct disk_events *ev,
* events_async : list of events which can be detected w/o polling
* (always empty, only for backwards compatibility)
* events_poll_msecs : polling interval, 0: disable, -1: system default
+ * diskseq : disk sequence number, since boot
*/
static ssize_t __disk_events_show(unsigned int events, char *buf)
{
@@ -1750,16 +1751,26 @@ static ssize_t disk_events_poll_msecs_store(struct device *dev,
return count;
}
+static ssize_t diskseq_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ return sprintf(buf, "%llu\n", disk->diskseq);
+}
+
static const DEVICE_ATTR(events, 0444, disk_events_show, NULL);
static const DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL);
static const DEVICE_ATTR(events_poll_msecs, 0644,
disk_events_poll_msecs_show,
disk_events_poll_msecs_store);
+static const DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL);
static const struct attribute *disk_sysfs_attrs[] = {
&dev_attr_events.attr,
&dev_attr_events_async.attr,
&dev_attr_events_poll_msecs.attr,
+ &dev_attr_diskseq.attr,
NULL,
};
--
2.31.1
From: Matteo Croce <[email protected]>
Increment the disk sequence number when the media has changed,
i.e. on DISK_EVENT_MEDIA_CHANGE event.
$ cat /sys/class/block/sr0/diskseq
12
$ eject
$ cat /sys/class/block/sr0/diskseq
22
Signed-off-by: Matteo Croce <[email protected]>
---
block/genhd.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/block/genhd.c b/block/genhd.c
index 768d8d5d1eca..9d58e0ea18ae 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1657,6 +1657,9 @@ static void disk_check_events(struct disk_events *ev,
spin_unlock_irq(&ev->lock);
+ if (events & DISK_EVENT_MEDIA_CHANGE)
+ inc_diskseq(disk);
+
/*
* Tell userland about new events. Only the events listed in
* @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT
--
2.31.1
From: Matteo Croce <[email protected]>
On a very loaded system, if there are many events queued up from multiple
attach/detach cycles, it's impossible to match them up with the
LOOP_CONFIGURE or LOOP_SET_FD call, since we don't know where the position
of our own association in the queue is[1].
Not even an empty uevent queue is a reliable indication that we already
received the uevent we were waiting for, since with multi-partition block
devices each partition's event is queued asynchronously and might be
delivered later.
Increment the disk sequence number when setting or changing the backing
file, so the userspace knows which backing file generated the event:
# udevadm monitor -kp |grep -e ^DEVNAME -e ^DISKSEQ &
[1] 263
# losetup -fP 3part
[ 12.309974] loop0: detected capacity change from 0 to 2048
DEVNAME=/dev/loop0
DISKSEQ=8
[ 12.360252] loop0: p1 p2 p3
DEVNAME=/dev/loop0
DISKSEQ=8
DEVNAME=/dev/loop0p1
DISKSEQ=8
DEVNAME=/dev/loop0p2
DISKSEQ=8
DEVNAME=/dev/loop0p3
DISKSEQ=8
# losetup -D
DEVNAME=/dev/loop0
DISKSEQ=9
DEVNAME=/dev/loop0p1
DISKSEQ=9
DEVNAME=/dev/loop0p2
DISKSEQ=9
DEVNAME=/dev/loop0p3
DISKSEQ=9
# losetup -fP 2part
[ 29.001344] loop0: detected capacity change from 0 to 2048
DEVNAME=/dev/loop0
DISKSEQ=10
[ 29.040226] loop0: p1 p2
DEVNAME=/dev/loop0
DISKSEQ=10
DEVNAME=/dev/loop0p1
DISKSEQ=10
DEVNAME=/dev/loop0p2
DISKSEQ=10
[1] https://github.com/systemd/systemd/issues/17469#issuecomment-762919781
Signed-off-by: Matteo Croce <[email protected]>
---
drivers/block/loop.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 76e12f3482a9..b1c638d23306 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -735,6 +735,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
goto out_err;
/* and ... switch */
+ inc_diskseq(lo->lo_disk);
blk_mq_freeze_queue(lo->lo_queue);
mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
lo->lo_backing_file = file;
@@ -1123,6 +1124,8 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
if (error)
goto out_unlock;
+ inc_diskseq(lo->lo_disk);
+
if (!(file->f_mode & FMODE_WRITE) || !(mode & FMODE_WRITE) ||
!file->f_op->write_iter)
lo->lo_flags |= LO_FLAGS_READ_ONLY;
@@ -1223,6 +1226,8 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
lo->lo_backing_file = NULL;
spin_unlock_irq(&lo->lo_lock);
+ inc_diskseq(lo->lo_disk);
+
loop_release_xfer(lo);
lo->transfer = NULL;
lo->ioctl = NULL;
--
2.31.1
On Wed, Jun 23, 2021 at 12:58:53PM +0200, Matteo Croce wrote:
> +void inc_diskseq(struct gendisk *disk)
> +{
> + static atomic64_t diskseq;
Please don't hide file scope variables in functions.
Can you explain a little more why we need a global sequence count vs
a per-disk one here?
> -static void disk_add_events(struct gendisk *disk)
> +static void disk_add_sysfs(struct gendisk *disk)
> {
> /* FIXME: error handling */
> - if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0)
> + if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_sysfs_attrs) < 0)
> pr_warn("%s: failed to create sysfs files for events\n",
> disk->disk_name);
> +}
Actually, what we need here is a way how we can setup the ->groups
field of the device to include all attribute groups instead of having
to call sysfs_create_files at all.
On Wed, Jun 23, 2021 at 12:58:58PM +0200, Matteo Croce wrote:
> From: Matteo Croce <[email protected]>
>
> On a very loaded system, if there are many events queued up from multiple
> attach/detach cycles, it's impossible to match them up with the
> LOOP_CONFIGURE or LOOP_SET_FD call, since we don't know where the position
> of our own association in the queue is[1].
> Not even an empty uevent queue is a reliable indication that we already
> received the uevent we were waiting for, since with multi-partition block
> devices each partition's event is queued asynchronously and might be
> delivered later.
>
> Increment the disk sequence number when setting or changing the backing
> file, so the userspace knows which backing file generated the event:
Instead of manually incrementing the sequence here, can we make loop
generate the DISK_EVENT_MEDIA_CHANGE event on a backing device (aka
media) change?
On 6/23/21 12:58 PM, Matteo Croce wrote:
> From: Matteo Croce <[email protected]>
>
> With this series a monotonically increasing number is added to disks,
> precisely in the genhd struct, and it's exported in sysfs and uevent.
>
> This helps the userspace correlate events for devices that reuse the
> same device, like loop.
>
I'm failing to see the point here.
Apparently you are assuming that there is a userspace tool tracking
events, and has a need to correlate events related to different
instances of the disk.
But if you have an userspace application tracking events, why can't the
same application track the 'add' and 'remove' events to track the
lifetime of the devices, and implement its own numbering based on that?
Why do we need to burden the kernel with this?
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
[email protected] +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer
On Wed, 2021-06-23 at 14:03 +0200, Hannes Reinecke wrote:
> On 6/23/21 12:58 PM, Matteo Croce wrote:
> > From: Matteo Croce <[email protected]>
> >
> > With this series a monotonically increasing number is added to disks,
> > precisely in the genhd struct, and it's exported in sysfs and uevent.
> >
> > This helps the userspace correlate events for devices that reuse the
> > same device, like loop.
> >
> I'm failing to see the point here.
> Apparently you are assuming that there is a userspace tool tracking
> events, and has a need to correlate events related to different
> instances of the disk.
> But if you have an userspace application tracking events, why can't the
> same application track the 'add' and 'remove' events to track the
> lifetime of the devices, and implement its own numbering based on that?
>
> Why do we need to burden the kernel with this?
>
> Cheers,
>
> Hannes
Hi,
It is not an assumption, such tool does exist, and manually tracking
does not work because of the impossibility of reliably correlating
events to devices (we've tried, again and again and again), which is
the purpose of this series - to solve this long standing issue, which
has been causing problems both in testing and production for a long
time now, despite our best efforts to add workaround after workaround.
For more info please see the discussion on the v1:
https://lore.kernel.org/linux-fsdevel/[email protected]/t/#m5b03e48013de14b4a080c90afdc4a8b8c94c30d4
and the bug linked in the cover letter:
https://github.com/systemd/systemd/issues/17469#issuecomment-762919781
--
Kind regards,
Luca Boccassi
On Wed, Jun 23, 2021 at 1:49 PM Christoph Hellwig <[email protected]> wrote:
>
> On Wed, Jun 23, 2021 at 12:58:53PM +0200, Matteo Croce wrote:
> > +void inc_diskseq(struct gendisk *disk)
> > +{
> > + static atomic64_t diskseq;
>
> Please don't hide file scope variables in functions.
>
I just didn't want to clobber that file namespace, as that is the only
point where it's used.
> Can you explain a little more why we need a global sequence count vs
> a per-disk one here?
The point of the whole series is to have an unique sequence number for
all the disks.
Events can arrive to the userspace delayed or out-of-order, so this
helps to correlate events to the disk.
It might seem strange, but there isn't a way to do this yet, so I come
up with a global, monotonically incrementing number.
--
per aspera ad upstream
On Wed, 2021-06-23 at 12:57 +0100, Christoph Hellwig wrote:
> On Wed, Jun 23, 2021 at 12:58:58PM +0200, Matteo Croce wrote:
> > From: Matteo Croce <[email protected]>
> >
> > On a very loaded system, if there are many events queued up from multiple
> > attach/detach cycles, it's impossible to match them up with the
> > LOOP_CONFIGURE or LOOP_SET_FD call, since we don't know where the position
> > of our own association in the queue is[1].
> > Not even an empty uevent queue is a reliable indication that we already
> > received the uevent we were waiting for, since with multi-partition block
> > devices each partition's event is queued asynchronously and might be
> > delivered later.
> >
> > Increment the disk sequence number when setting or changing the backing
> > file, so the userspace knows which backing file generated the event:
>
> Instead of manually incrementing the sequence here, can we make loop
> generate the DISK_EVENT_MEDIA_CHANGE event on a backing device (aka
> media) change?
Hi,
This was answered in the v1 thread:
https://lore.kernel.org/linux-fsdevel/[email protected]/t/#m8a677028572e826352cbb1e19d1b9c1f3b6bff4b
The fundamental issue is that we'd be back at trying to correlate
events to loopdev instances, which does not work reliably - hence this
patch series. With the new ioctl, we can get the id immediately and
without delay when we create the device, with no possible races. Then
we can handle events reliably, as we can correlate correctly in all
cases.
--
Kind regards,
Luca Boccassi
On Mi, 23.06.21 15:10, Matteo Croce ([email protected]) wrote:
> On Wed, Jun 23, 2021 at 1:49 PM Christoph Hellwig <[email protected]> wrote:
> >
> > On Wed, Jun 23, 2021 at 12:58:53PM +0200, Matteo Croce wrote:
> > > +void inc_diskseq(struct gendisk *disk)
> > > +{
> > > + static atomic64_t diskseq;
> >
> > Please don't hide file scope variables in functions.
> >
>
> I just didn't want to clobber that file namespace, as that is the only
> point where it's used.
>
> > Can you explain a little more why we need a global sequence count vs
> > a per-disk one here?
>
> The point of the whole series is to have an unique sequence number for
> all the disks.
> Events can arrive to the userspace delayed or out-of-order, so this
> helps to correlate events to the disk.
> It might seem strange, but there isn't a way to do this yet, so I come
> up with a global, monotonically incrementing number.
To extend on this and given an example why the *global* sequence number
matters:
Consider you plug in a USB storage key, and it gets named
/dev/sda. You unplug it, the kernel structures for that device all
disappear. Then you plug in a different USB storage key, and since
it's the only one it will too be called /dev/sda.
With the global sequence number we can still distinguish these two
devices even though otherwise they can look pretty much identical. If
we had per-device counters then this would fall flat because the
counter would be flushed out when the device disappears and when a device
reappears under the same generic name we couldn't assign it a
different sequence number than before.
Thus: a global instead of local sequence number counter is absolutely
*key* for the problem this is supposed to solve
Lennart
--
Lennart Poettering, Berlin
On 6/23/21 3:51 PM, Lennart Poettering wrote:
> On Mi, 23.06.21 15:10, Matteo Croce ([email protected]) wrote:
>
>> On Wed, Jun 23, 2021 at 1:49 PM Christoph Hellwig <[email protected]> wrote:
>>>
>>> On Wed, Jun 23, 2021 at 12:58:53PM +0200, Matteo Croce wrote:
>>>> +void inc_diskseq(struct gendisk *disk)
>>>> +{
>>>> + static atomic64_t diskseq;
>>>
>>> Please don't hide file scope variables in functions.
>>>
>>
>> I just didn't want to clobber that file namespace, as that is the only
>> point where it's used.
>>
>>> Can you explain a little more why we need a global sequence count vs
>>> a per-disk one here?
>>
>> The point of the whole series is to have an unique sequence number for
>> all the disks.
>> Events can arrive to the userspace delayed or out-of-order, so this
>> helps to correlate events to the disk.
>> It might seem strange, but there isn't a way to do this yet, so I come
>> up with a global, monotonically incrementing number.
>
> To extend on this and given an example why the *global* sequence number
> matters:
>
> Consider you plug in a USB storage key, and it gets named
> /dev/sda. You unplug it, the kernel structures for that device all
> disappear. Then you plug in a different USB storage key, and since
> it's the only one it will too be called /dev/sda.
>
> With the global sequence number we can still distinguish these two
> devices even though otherwise they can look pretty much identical. If
> we had per-device counters then this would fall flat because the
> counter would be flushed out when the device disappears and when a device
> reappears under the same generic name we couldn't assign it a
> different sequence number than before.
>
> Thus: a global instead of local sequence number counter is absolutely
> *key* for the problem this is supposed to solve
>
Well ... except that you'll need to keep track of the numbers (otherwise
you wouldn't know if the numbers changed, right?).
And if you keep track of the numbers you probably will have to implement
an uevent listener to get the events in time.
But if you have an uevent listener you will also get the add/remove
events for these devices.
And if you get add and remove events you can as well implement sequence
numbers in your application, seeing that you have all information
allowing you to do so.
So why burden the kernel with it?
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
[email protected] +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer
On Wed, 2021-06-23 at 16:01 +0200, Hannes Reinecke wrote:
> On 6/23/21 3:51 PM, Lennart Poettering wrote:
> > On Mi, 23.06.21 15:10, Matteo Croce ([email protected]) wrote:
> >
> > > On Wed, Jun 23, 2021 at 1:49 PM Christoph Hellwig <[email protected]> wrote:
> > > > On Wed, Jun 23, 2021 at 12:58:53PM +0200, Matteo Croce wrote:
> > > > > +void inc_diskseq(struct gendisk *disk)
> > > > > +{
> > > > > + static atomic64_t diskseq;
> > > >
> > > > Please don't hide file scope variables in functions.
> > > >
> > >
> > > I just didn't want to clobber that file namespace, as that is the only
> > > point where it's used.
> > >
> > > > Can you explain a little more why we need a global sequence count vs
> > > > a per-disk one here?
> > >
> > > The point of the whole series is to have an unique sequence number for
> > > all the disks.
> > > Events can arrive to the userspace delayed or out-of-order, so this
> > > helps to correlate events to the disk.
> > > It might seem strange, but there isn't a way to do this yet, so I come
> > > up with a global, monotonically incrementing number.
> >
> > To extend on this and given an example why the *global* sequence number
> > matters:
> >
> > Consider you plug in a USB storage key, and it gets named
> > /dev/sda. You unplug it, the kernel structures for that device all
> > disappear. Then you plug in a different USB storage key, and since
> > it's the only one it will too be called /dev/sda.
> >
> > With the global sequence number we can still distinguish these two
> > devices even though otherwise they can look pretty much identical. If
> > we had per-device counters then this would fall flat because the
> > counter would be flushed out when the device disappears and when a device
> > reappears under the same generic name we couldn't assign it a
> > different sequence number than before.
> >
> > Thus: a global instead of local sequence number counter is absolutely
> > *key* for the problem this is supposed to solve
> >
> Well ... except that you'll need to keep track of the numbers (otherwise
> you wouldn't know if the numbers changed, right?).
> And if you keep track of the numbers you probably will have to implement
> an uevent listener to get the events in time.
> But if you have an uevent listener you will also get the add/remove
> events for these devices.
> And if you get add and remove events you can as well implement sequence
> numbers in your application, seeing that you have all information
> allowing you to do so.
> So why burden the kernel with it?
>
> Cheers,
>
> Hannes
Hi,
We need this so that we can reliably correlate events to instances of a
device. Events alone cannot solve this problem, because events _are_
the problem.
--
Kind regards,
Luca Boccassi
On Mi, 23.06.21 14:03, Hannes Reinecke ([email protected]) wrote:
> On 6/23/21 12:58 PM, Matteo Croce wrote:
> > From: Matteo Croce <[email protected]>
> >
> > With this series a monotonically increasing number is added to disks,
> > precisely in the genhd struct, and it's exported in sysfs and uevent.
> >
> > This helps the userspace correlate events for devices that reuse the
> > same device, like loop.
> >
> I'm failing to see the point here.
> Apparently you are assuming that there is a userspace tool tracking events,
> and has a need to correlate events related to different instances of the
> disk.
> But if you have an userspace application tracking events, why can't the same
> application track the 'add' and 'remove' events to track the lifetime of the
> devices, and implement its own numbering based on that?
>
> Why do we need to burden the kernel with this?
The problem is that tracking the "add" and "remove" events is simply
not safely possibly right now for block devices whose names are
frequently reused.
Consider the loopback block device subsystem: whenever some tool wants
a loopback block device it will ask the kernel for one and the kernel
allocates from the bottom, hence /dev/loop0 is the most frequently
used loopback block device. If a large number of concurrently running
programs now repeatedly/quickly allocate/deallocate block devices they
all sooner or later get /dev/loop0. If they now want to watch the
"add" and "remove" uevents for that device for their own use of it
there's a very good chance they'll end up seeing the previous user's
"add" and "remove" events, as there's simply no way to associate the
uevents you see with *your* *own* use of /dev/loop0 right now, and
distinguish them from the uevent that might have been queued from a
prior use of /dev/loop0 and were just slow to be processed.
or to say this differently: loopback devices are named from a very
small, dense pool of names, and are frequently and quickly
reused. uevents are enqeued asynchronously and potentially take a long
time to reach the listeners (after all they have to travel through two
AF_NETLINK sockets and udev) and the only way to match up the device
uses and their uevents is by these kernel device names that are so
useless as a stable identifier.
This not only applies to loopback block devices, but many other block
device subsystems too. For example nbd allocates from the bottom, too,
i.e. /dev/nbd0 is the most like name to be used. And for SCSI devices
too: if you quickly plug/unplug/replug a bunch of USB sticks, you'll
likely always get /dev/sda...
Lennart
--
Lennart Poettering, Berlin
On Mi, 23.06.21 16:01, Hannes Reinecke ([email protected]) wrote:
> > Thus: a global instead of local sequence number counter is absolutely
> > *key* for the problem this is supposed to solve
> >
> Well ... except that you'll need to keep track of the numbers (otherwise you
> wouldn't know if the numbers changed, right?).
> And if you keep track of the numbers you probably will have to implement an
> uevent listener to get the events in time.
Hmm? This is backwards. The goal here is to be able to safely match up
uevents to specific uses of a block device, given that block device
names are agressively recycled.
you imply it was easy to know which device use a uevent belongs
to. But that's the problem: it is not possible to do so safely. if i
see a uevent for a block device "loop0" I cannot tell if it was from
my own use of the device or for some previous user of it.
And that's what we'd like to see fixed: i.e. we query the block device
for the seqeno now used and then we can use that to filter the uevents
and ignore the ones that do not carry the same sequence number as we
got assigned for our user.
Lennart
--
Lennart Poettering, Berlin
On 6/23/21 4:07 PM, Luca Boccassi wrote:
> On Wed, 2021-06-23 at 16:01 +0200, Hannes Reinecke wrote:
>> On 6/23/21 3:51 PM, Lennart Poettering wrote:
>>> On Mi, 23.06.21 15:10, Matteo Croce ([email protected]) wrote:
>>>
>>>> On Wed, Jun 23, 2021 at 1:49 PM Christoph Hellwig <[email protected]> wrote:
>>>>> On Wed, Jun 23, 2021 at 12:58:53PM +0200, Matteo Croce wrote:
>>>>>> +void inc_diskseq(struct gendisk *disk)
>>>>>> +{
>>>>>> + static atomic64_t diskseq;
>>>>>
>>>>> Please don't hide file scope variables in functions.
>>>>>
>>>>
>>>> I just didn't want to clobber that file namespace, as that is the only
>>>> point where it's used.
>>>>
>>>>> Can you explain a little more why we need a global sequence count vs
>>>>> a per-disk one here?
>>>>
>>>> The point of the whole series is to have an unique sequence number for
>>>> all the disks.
>>>> Events can arrive to the userspace delayed or out-of-order, so this
>>>> helps to correlate events to the disk.
>>>> It might seem strange, but there isn't a way to do this yet, so I come
>>>> up with a global, monotonically incrementing number.
>>>
>>> To extend on this and given an example why the *global* sequence number
>>> matters:
>>>
>>> Consider you plug in a USB storage key, and it gets named
>>> /dev/sda. You unplug it, the kernel structures for that device all
>>> disappear. Then you plug in a different USB storage key, and since
>>> it's the only one it will too be called /dev/sda.
>>>
>>> With the global sequence number we can still distinguish these two
>>> devices even though otherwise they can look pretty much identical. If
>>> we had per-device counters then this would fall flat because the
>>> counter would be flushed out when the device disappears and when a device
>>> reappears under the same generic name we couldn't assign it a
>>> different sequence number than before.
>>>
>>> Thus: a global instead of local sequence number counter is absolutely
>>> *key* for the problem this is supposed to solve
>>>
>> Well ... except that you'll need to keep track of the numbers (otherwise
>> you wouldn't know if the numbers changed, right?).
>> And if you keep track of the numbers you probably will have to implement
>> an uevent listener to get the events in time.
>> But if you have an uevent listener you will also get the add/remove
>> events for these devices.
>> And if you get add and remove events you can as well implement sequence
>> numbers in your application, seeing that you have all information
>> allowing you to do so.
>> So why burden the kernel with it?
>>
>> Cheers,
>>
>> Hannes
>
> Hi,
>
> We need this so that we can reliably correlate events to instances of a
> device. Events alone cannot solve this problem, because events _are_
> the problem.
>
In which sense?
Yes, events can be delayed (if you list to uevents), but if you listen
to kernel events there shouldn't be a delay, right?
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
[email protected] +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer
On Wed, Jun 23, 2021 at 02:13:25PM +0100, Luca Boccassi wrote:
> On Wed, 2021-06-23 at 12:57 +0100, Christoph Hellwig wrote:
> > On Wed, Jun 23, 2021 at 12:58:58PM +0200, Matteo Croce wrote:
> > > From: Matteo Croce <[email protected]>
> > >
> > > On a very loaded system, if there are many events queued up from multiple
> > > attach/detach cycles, it's impossible to match them up with the
> > > LOOP_CONFIGURE or LOOP_SET_FD call, since we don't know where the position
> > > of our own association in the queue is[1].
> > > Not even an empty uevent queue is a reliable indication that we already
> > > received the uevent we were waiting for, since with multi-partition block
> > > devices each partition's event is queued asynchronously and might be
> > > delivered later.
> > >
> > > Increment the disk sequence number when setting or changing the backing
> > > file, so the userspace knows which backing file generated the event:
> >
> > Instead of manually incrementing the sequence here, can we make loop
> > generate the DISK_EVENT_MEDIA_CHANGE event on a backing device (aka
> > media) change?
>
> Hi,
>
> This was answered in the v1 thread:
>
> https://lore.kernel.org/linux-fsdevel/[email protected]/t/#m8a677028572e826352cbb1e19d1b9c1f3b6bff4b
>
> The fundamental issue is that we'd be back at trying to correlate
> events to loopdev instances, which does not work reliably - hence this
> patch series. With the new ioctl, we can get the id immediately and
> without delay when we create the device, with no possible races. Then
> we can handle events reliably, as we can correlate correctly in all
> cases.
I very much disagree with your reply there. The device now points to
a different media. Both for the loop device, a floppy or a CD changer
probably by some kind of user action. In the last cast it might even
by done entirely locally through a script just like the loop device.
On Wed, Jun 23, 2021 at 03:10:21PM +0200, Matteo Croce wrote:
> I just didn't want to clobber that file namespace, as that is the only
> point where it's used.
It doesn't really clobber the file namespace. Declaring it normally
at the top of the file makes it very clear we have global state.
Hiding it in a function just causes obsfucation.
> > Can you explain a little more why we need a global sequence count vs
> > a per-disk one here?
>
> The point of the whole series is to have an unique sequence number for
> all the disks.
> Events can arrive to the userspace delayed or out-of-order, so this
> helps to correlate events to the disk.
> It might seem strange, but there isn't a way to do this yet, so I come
> up with a global, monotonically incrementing number.
Maybe add a comment to explain that?
On Wed, 2021-06-23 at 16:21 +0200, Hannes Reinecke wrote:
> On 6/23/21 4:07 PM, Luca Boccassi wrote:
> > On Wed, 2021-06-23 at 16:01 +0200, Hannes Reinecke wrote:
> > > On 6/23/21 3:51 PM, Lennart Poettering wrote:
> > > > On Mi, 23.06.21 15:10, Matteo Croce ([email protected]) wrote:
> > > >
> > > > > On Wed, Jun 23, 2021 at 1:49 PM Christoph Hellwig <[email protected]> wrote:
> > > > > > On Wed, Jun 23, 2021 at 12:58:53PM +0200, Matteo Croce wrote:
> > > > > > > +void inc_diskseq(struct gendisk *disk)
> > > > > > > +{
> > > > > > > + static atomic64_t diskseq;
> > > > > >
> > > > > > Please don't hide file scope variables in functions.
> > > > > >
> > > > >
> > > > > I just didn't want to clobber that file namespace, as that is the only
> > > > > point where it's used.
> > > > >
> > > > > > Can you explain a little more why we need a global sequence count vs
> > > > > > a per-disk one here?
> > > > >
> > > > > The point of the whole series is to have an unique sequence number for
> > > > > all the disks.
> > > > > Events can arrive to the userspace delayed or out-of-order, so this
> > > > > helps to correlate events to the disk.
> > > > > It might seem strange, but there isn't a way to do this yet, so I come
> > > > > up with a global, monotonically incrementing number.
> > > >
> > > > To extend on this and given an example why the *global* sequence number
> > > > matters:
> > > >
> > > > Consider you plug in a USB storage key, and it gets named
> > > > /dev/sda. You unplug it, the kernel structures for that device all
> > > > disappear. Then you plug in a different USB storage key, and since
> > > > it's the only one it will too be called /dev/sda.
> > > >
> > > > With the global sequence number we can still distinguish these two
> > > > devices even though otherwise they can look pretty much identical. If
> > > > we had per-device counters then this would fall flat because the
> > > > counter would be flushed out when the device disappears and when a device
> > > > reappears under the same generic name we couldn't assign it a
> > > > different sequence number than before.
> > > >
> > > > Thus: a global instead of local sequence number counter is absolutely
> > > > *key* for the problem this is supposed to solve
> > > >
> > > Well ... except that you'll need to keep track of the numbers (otherwise
> > > you wouldn't know if the numbers changed, right?).
> > > And if you keep track of the numbers you probably will have to implement
> > > an uevent listener to get the events in time.
> > > But if you have an uevent listener you will also get the add/remove
> > > events for these devices.
> > > And if you get add and remove events you can as well implement sequence
> > > numbers in your application, seeing that you have all information
> > > allowing you to do so.
> > > So why burden the kernel with it?
> > >
> > > Cheers,
> > >
> > > Hannes
> >
> > Hi,
> >
> > We need this so that we can reliably correlate events to instances of a
> > device. Events alone cannot solve this problem, because events _are_
> > the problem.
> >
> In which sense?
> Yes, events can be delayed (if you list to uevents), but if you listen
> to kernel events there shouldn't be a delay, right?
>
> Cheers,
>
> Hannes
Hi,
Userspace programs don't have exclusive usage rights on loopdev, so you
can't reliably know if an uevent correlates to the loop0 you just
added, or to the loop0 someone else added some time earlier. This
series lets us do just that, reliably, without races, and fix long-
standing bugs. Please see Lennart's reply, it has much more details.
Thanks!
--
Kind regards,
Luca Boccassi
On Mi, 23.06.21 16:21, Hannes Reinecke ([email protected]) wrote:
> > We need this so that we can reliably correlate events to instances of a
> > device. Events alone cannot solve this problem, because events _are_
> > the problem.
> >
> In which sense?
> Yes, events can be delayed (if you list to uevents), but if you listen to
> kernel events there shouldn't be a delay, right?
uevents are delivered to userpace via an AF_NETLINK socket. The
AF_NETLINK socket is basically an asynchronous buffer.
I mean, consider what you are saying: you establish the AF_NETLINK
uevent watching socket, then you allocate /dev/loop0. Since you cannot
do that atomically, you'll first have to do one, and then the
other. But if you do that in two steps, then in the middle some other
process might get scheduled that quickly allocates /dev/loop0 and
releases it again, before your code gets to run. So now you have in
your AF_NETLINK socket buffer the uevents for that other process' use
of the device, and you cannot sanely distinguish them from your own.
of course you could do it the other way: allocate the device first,
and only then allocate the AF_NETLINK uevent socket. But then you
might or might not lose the "add" event for the device you just
allocated. And you don't know if you should wait for it or not.
This isn't even a constructed issue, this is the common case if you
have multiple processes all simultaneously trying to acquire a loopback
block device, because they all will end up eying /dev/loop0 at the same time.
But it gets worse IRL because of various factors. For example,
partition probing is asynchronous, so if you use LO_FLAGS_PARTSCAN and
want to watch for some partition device associated to your loopback
block device to show up, this can take *really* long, so the race
window is large. Or you actually use udev (like most userspace
probably should) because you want the metainfo it collects about the
device, in which case it will take even longer for the uevent to reach
you, i.e. the time window where a previous user's uevents and your own
for the same loopback device "overlap" can be quite large and you
cannot determine if they are yours or the previous user's uevents —
unless we have these new sequence numbers.
Lennart
--
Lennart Poettering, Berlin
On 6/23/21 4:12 PM, Lennart Poettering wrote:
> On Mi, 23.06.21 16:01, Hannes Reinecke ([email protected]) wrote:
>
>>> Thus: a global instead of local sequence number counter is absolutely
>>> *key* for the problem this is supposed to solve
>>>
>> Well ... except that you'll need to keep track of the numbers (otherwise you
>> wouldn't know if the numbers changed, right?).
>> And if you keep track of the numbers you probably will have to implement an
>> uevent listener to get the events in time.
>
> Hmm? This is backwards. The goal here is to be able to safely match up
> uevents to specific uses of a block device, given that block device
> names are agressively recycled.
>
> you imply it was easy to know which device use a uevent belongs
> to. But that's the problem: it is not possible to do so safely. if i
> see a uevent for a block device "loop0" I cannot tell if it was from
> my own use of the device or for some previous user of it.
>
> And that's what we'd like to see fixed: i.e. we query the block device
> for the seqeno now used and then we can use that to filter the uevents
> and ignore the ones that do not carry the same sequence number as we
> got assigned for our user.
>
It is notoriously tricky to monitor the intended use-case for kernel
devices, precisely because we do _not_ attach any additional information
to it.
I have send a proposal for LSF to implement block-namespaces, the prime
use-case of which is indeed attaching cgroup/namespace information to
block devices such that we _can_ match (block) devices to specific contexts.
Which I rather prefer than adding sequence numbers to block devices;
incidentally you could solve the same problem by _not_ reusing numbers
aggressively but rather allocate the next free one after the most
recently allocated one.
Will give you much the same thing without having to burden others with it.
The better alternative here would be to extend the loop ioctl to pass in
an UUID when allocating the device.
That way you can easily figure out whether the loop device has been
modified.
But in the end, it's the loop driver and I'm not particular bothered
with it. I am, though, if you need to touch all drivers just to support
one particular use-case in one particular device driver.
Incidentally, we don't have this problem in SCSI as we _can_ identify
devices here. So in the end we couldn't care less on which /dev/sdX
device it ends up.
And I guess that's what we should attempt for loop devices, too.
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
[email protected] +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer
On Mi, 23.06.21 15:25, Christoph Hellwig ([email protected]) wrote:
> On Wed, Jun 23, 2021 at 02:13:25PM +0100, Luca Boccassi wrote:
> > On Wed, 2021-06-23 at 12:57 +0100, Christoph Hellwig wrote:
> > > On Wed, Jun 23, 2021 at 12:58:58PM +0200, Matteo Croce wrote:
> > > > From: Matteo Croce <[email protected]>
> > > >
> > > > On a very loaded system, if there are many events queued up from multiple
> > > > attach/detach cycles, it's impossible to match them up with the
> > > > LOOP_CONFIGURE or LOOP_SET_FD call, since we don't know where the position
> > > > of our own association in the queue is[1].
> > > > Not even an empty uevent queue is a reliable indication that we already
> > > > received the uevent we were waiting for, since with multi-partition block
> > > > devices each partition's event is queued asynchronously and might be
> > > > delivered later.
> > > >
> > > > Increment the disk sequence number when setting or changing the backing
> > > > file, so the userspace knows which backing file generated the event:
> > >
> > > Instead of manually incrementing the sequence here, can we make loop
> > > generate the DISK_EVENT_MEDIA_CHANGE event on a backing device (aka
> > > media) change?
> >
> > Hi,
> >
> > This was answered in the v1 thread:
> >
> > https://lore.kernel.org/linux-fsdevel/[email protected]/t/#m8a677028572e826352cbb1e19d1b9c1f3b6bff4b
> >
> > The fundamental issue is that we'd be back at trying to correlate
> > events to loopdev instances, which does not work reliably - hence this
> > patch series. With the new ioctl, we can get the id immediately and
> > without delay when we create the device, with no possible races. Then
> > we can handle events reliably, as we can correlate correctly in all
> > cases.
>
> I very much disagree with your reply there. The device now points to
> a different media. Both for the loop device, a floppy or a CD changer
> probably by some kind of user action. In the last cast it might even
> by done entirely locally through a script just like the loop device.
I am not sure I grok your point.
but let me try to explain why I think it's better to make media
changes *also* trigger seqno changes, and not make media change events
the *only* way to trigger seqno changes.
1. First of all, loopback devices currently don't hook into the media
change logic (which so far is focussed on time-based polling
actually, for both CDs and floppies). Adding this would change
semantics visibly to userspace (since userspace would suddenly see
another action=change + DISK_MEDIA_CHANGE=1 uevent suddenly that it
needs to handle correctly). One can certainly argue that userspace
must be ready to get additional uevents like this any time, but
knowing userspace a bit I am pretty sure this will confuse some
userspace that doesn't expect this. I mean loopback block devices
already issue "change" uevents on attachment and detachment, one
that userpace typically expects, but adding the media change one
probably means sending two (?) of these out for each
attachment. One being the old one from the loopback device itself,
and then another one for the media change from the mdia change
logic. That's not just noisy, but also ugly.
2. We want seqnums to be allocated for devices not only when doing
media change (e.g. when attaching or detaching a loopback device)
but also when allocating a block device, so that even before the
first media change event a block device has a sequence number. This
means allocating a sequence number for block devices won't be
limited to the media change code anyway.
3. Doing the sequence number bumping in media change code exclusively
kinda suggests this was something we could properly abstract away,
to be done only there, and that the rest of the block subsystems
wouldn#t have to bother much. But I am pretty sure that's not
correct: in particular in the loopback block device code (but in
other block subsystems too I guess) we really want to be able to
atomically attach a loopback block device and return the seqnum of
that very attachmnt so that we can focus on uevents for it. Thus,
attachment, allocation and returning the seqnum to userspace in the
LOOP_CONFIGURE ioctl (or whatever is appropriate) kinda go hand in
hand.
4. The media change protocol follows a protocol along with the eject
button handling (DISK_EVENT_EJECT_REQUEST), the locking of the tray
and the time based polling. i.e. it's specifically focussed on
certain hw features, none of which really apply to loopback block
devices, which have no trays to lock, but eject buttons to handle,
and where media change is always triggered internally by privileged
code, never externally by the user. I doubt it makes sense to mix
these up. Currently udev+udisks in userspace implement that
procotol for cdroms/floppies, and I doubt we would want to bother
to extend this for loopback devices in userspace. In fact, if media
change events are added to loopback devices, we probably would have
to change userspace to ignore them.
TLDR: making loopback block devices generate media is a (probably
minor, but unclear) API break, probably means duplicating uevent
traffic for it, won't abstract things away anyway, won't allocate a
seqnum on device allocation, and won't actually use anything of the
real media change functionality.
Or to say this differently: if the goal is to make loopback block
devices to send CDROM compatible media change events to userspace,
then I think it would probably make more sense to attach the
DISK_MEDIA_CHANGE=1 property to the attachment/detachment uevents the
loopback devices *already* generate, rather than to try to shoehorn the
existing media change infrastructure into the loopback device logic,
trying to reuse it even though nothing of it is really needed.
But that said, I am not aware of anyone ever asking for CDROM
compatible EDISK_MEDIA_CHANGE=1 uevents for loopback devices. I really
wouldn't bother with that. Sounds like nothing anyone want with a
chance of breaking things everywhere. (i.e. remember the
"bind"/"unbind" uevent fiasco?)
Lennart
--
Lennart Poettering, Berlin
On Wed, 2021-06-23 at 17:02 +0200, Hannes Reinecke wrote:
> On 6/23/21 4:12 PM, Lennart Poettering wrote:
> > On Mi, 23.06.21 16:01, Hannes Reinecke ([email protected]) wrote:
> >
> > > > Thus: a global instead of local sequence number counter is absolutely
> > > > *key* for the problem this is supposed to solve
> > > >
> > > Well ... except that you'll need to keep track of the numbers (otherwise you
> > > wouldn't know if the numbers changed, right?).
> > > And if you keep track of the numbers you probably will have to implement an
> > > uevent listener to get the events in time.
> >
> > Hmm? This is backwards. The goal here is to be able to safely match up
> > uevents to specific uses of a block device, given that block device
> > names are agressively recycled.
> >
> > you imply it was easy to know which device use a uevent belongs
> > to. But that's the problem: it is not possible to do so safely. if i
> > see a uevent for a block device "loop0" I cannot tell if it was from
> > my own use of the device or for some previous user of it.
> >
> > And that's what we'd like to see fixed: i.e. we query the block device
> > for the seqeno now used and then we can use that to filter the uevents
> > and ignore the ones that do not carry the same sequence number as we
> > got assigned for our user.
> >
>
> It is notoriously tricky to monitor the intended use-case for kernel
> devices, precisely because we do _not_ attach any additional information
> to it.
> I have send a proposal for LSF to implement block-namespaces, the prime
> use-case of which is indeed attaching cgroup/namespace information to
> block devices such that we _can_ match (block) devices to specific contexts.
Having namespaces for block devices would be an awesome feature, very
much looking forward to have that, as it solves a number of other
issues we have.
And while it could maybe be used in some instances of this particular
problem, unfortunately I don't think it can solve all of them - in some
real cases, we have to work in the root namespace as we are setting
things up for it.
> Which I rather prefer than adding sequence numbers to block devices;
> incidentally you could solve the same problem by _not_ reusing numbers
> aggressively but rather allocate the next free one after the most
> recently allocated one.
> Will give you much the same thing without having to burden others with it.
If I understood this right, you are proposing to move the
monothonically increasing sequence id to the device name, rather than
as internal metadata? So that, eg, loop0 gets used exactly once and
never again, and so on? Wouldn't that be a much more visible and
disruptive change, potentially backward incompatible and breaking
userspace left and right?
> The better alternative here would be to extend the loop ioctl to pass in
> an UUID when allocating the device.
> That way you can easily figure out whether the loop device has been
> modified.
A UUID solves the problem we are currently talking about. But a
monothonically increasing sequence number has additional great
properties compared to a UUID, that we very much want to make use of
(again these are all _real_ use cases), and were described in detail
here:
https://lore.kernel.org/linux-fsdevel/[email protected]/t/#m3b1ffdffcc70a7c3ef4d7f13d0c2d5b9e4dde35a
> But in the end, it's the loop driver and I'm not particular bothered
> with it. I am, though, if you need to touch all drivers just to support
> one particular use-case in one particular device driver.
>
> Incidentally, we don't have this problem in SCSI as we _can_ identify
> devices here. So in the end we couldn't care less on which /dev/sdX
> device it ends up.
> And I guess that's what we should attempt for loop devices, too.
Sorry, I'm not sure what you mean here by "touch all drivers"? This
series changes only drivers/block/loop.c, no other device drivers code
is touched?
--
Kind regards,
Luca Boccassi
On Mi, 23.06.21 17:02, Hannes Reinecke ([email protected]) wrote:
> > you imply it was easy to know which device use a uevent belongs
> > to. But that's the problem: it is not possible to do so safely. if i
> > see a uevent for a block device "loop0" I cannot tell if it was from
> > my own use of the device or for some previous user of it.
> >
> > And that's what we'd like to see fixed: i.e. we query the block device
> > for the seqeno now used and then we can use that to filter the uevents
> > and ignore the ones that do not carry the same sequence number as we
> > got assigned for our user.
>
> It is notoriously tricky to monitor the intended use-case for kernel
> devices, precisely because we do _not_ attach any additional information to
> it.
> I have send a proposal for LSF to implement block-namespaces, the prime
> use-case of which is indeed attaching cgroup/namespace information to block
> devices such that we _can_ match (block) devices to specific
> contexts.
The goal of the patchset is to make loopback block devices (and
similar) safely and robustly concurrently allocatable from the main OS
namespace, without any cgroup/containerization logic.
In systemd we want to be able to allocate loopback block devices from
any context, and concurrently without having to set up a
cgroup/namespace first for each user for it. Any approach that binds
two distinct subsystems like this together (e.g. "you need to set up
cgroups to safely allocate loopback block devices") is really
problematic for us, since we manage both, but independently and always
with minimal privileges.
> Which I rather prefer than adding sequence numbers to block devices;
> incidentally you could solve the same problem by _not_ reusing numbers
> aggressively but rather allocate the next free one after the most recently
> allocated one.
You are suggesting that instead of allocating loopback block devices
always from the "bottom", i.e. always handing out from "loop0" on,
with the lowest preferred we'd just always hand out "loop1", "loop2",
… with strictly monotonically increasing numbres and never reuse
"loop0" anymore and other names we already handed out? That would
certainly work, but this would require quite some kernel rework, since
the loopbck allocation API is really not designed to work like that
right now.
Moreover, the proposed sequence number stuff also covers
floppies/cdroms and other stuff nicely, i.e. where drives stick around
but their media changes. Also, USB sticks are currently effectively
always called /dev/sda. It would be great to be able to distinguish
each plug/replug too. Of course, you could argue that there too
/dev/sda should never be reused, but strictly monotonically increasing
/dev/sdb, /dev/sdc, … and so on, and I'd sympathize with that, but
that makes it a major kernel rework, because basically every block
subsystem would have to be reworked to never reuse block device names
anymore.
Also, i doubt people would be happy if they then regularly would have
to deal with device names such as /dev/loop84763874658743 or
/dev/sdzbghz just because their system has been running for a while.
> The better alternative here would be to extend the loop ioctl to pass in an
> UUID when allocating the device.
> That way you can easily figure out whether the loop device has been
> modified.
UUIDs instead of sequence numbers would mostly solve our probelms
too. i.e. chaotic, randomized assignment of identifiers instead of
linearly progressing assignment of idenitifers. However I prefer
sequence numbers as discussed in this thread before: they allow us to
derive ordering from things: thus if you see an uevent with a seqnum
smaller than the one you are interested in you know its worth waiting
for the ones you are looking for to appear. But if you see a uevent
with a seqnum greater than the one you are interested in then you know
it's pointless to wait, the device has already been acquired by
someone else. With randomized UUIDs you can't know that, since uses by
other participants are only recognizable as distinct from your own but
don't tell you if they are earlier or later than your own. After all
the AF_NETLINK/uevent socket is lossy, so you must be prepared for
dropped messages, hence it's reat if we can easily resync when your
own messages get dropped.
Lennart
--
Lennart Poettering, Berlin
On Wed, Jun 23, 2021 at 1:53 PM Christoph Hellwig <[email protected]> wrote:
>
> > -static void disk_add_events(struct gendisk *disk)
> > +static void disk_add_sysfs(struct gendisk *disk)
> > {
> > /* FIXME: error handling */
> > - if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0)
> > + if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_sysfs_attrs) < 0)
> > pr_warn("%s: failed to create sysfs files for events\n",
> > disk->disk_name);
> > +}
>
> Actually, what we need here is a way how we can setup the ->groups
> field of the device to include all attribute groups instead of having
> to call sysfs_create_files at all.
I don't get this one. You mean in general or in this series?
--
per aspera ad upstream
On Wed, Jun 23, 2021 at 05:29:17PM +0200, Lennart Poettering wrote:
> I am not sure I grok your point.
You don't.
> 1. First of all, loopback devices currently don't hook into the media
> change logic (which so far is focussed on time-based polling
> actually, for both CDs and floppies).
And that is the whole problem. We need to first fix loop devices to
hook into the existing mechanism to report media changes. We can then
enhance that mechanism to be more suitable to loop (avoid the polling)
and userspace (add a sequence number). But if we don't have the basic
agreement to fully integreat loop with the existing way that the kernel
reports media change we don't even need to discuss this series and can
just ignore it, as it simply won't be acceptable.
> Adding this would change
> semantics visibly to userspace (since userspace would suddenly see
> another action=change + DISK_MEDIA_CHANGE=1 uevent suddenly that it
> needs to handle correctly).
Yes, and that is a good thing as loop is currently completely broken
in this respect.
> 2. We want seqnums to be allocated for devices not only when doing
> media change (e.g. when attaching or detaching a loopback device)
> but also when allocating a block device, so that even before the
> first media change event a block device has a sequence number. This
> means allocating a sequence number for block devices won't be
> limited to the media change code anyway.
Doing this on creation is fine, and attaching is by definition a media
change.
On Wed, Jun 23, 2021 at 09:03:40PM +0200, Matteo Croce wrote:
> On Wed, Jun 23, 2021 at 1:53 PM Christoph Hellwig <[email protected]> wrote:
> >
> > > -static void disk_add_events(struct gendisk *disk)
> > > +static void disk_add_sysfs(struct gendisk *disk)
> > > {
> > > /* FIXME: error handling */
> > > - if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0)
> > > + if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_sysfs_attrs) < 0)
> > > pr_warn("%s: failed to create sysfs files for events\n",
> > > disk->disk_name);
> > > +}
> >
> > Actually, what we need here is a way how we can setup the ->groups
> > field of the device to include all attribute groups instead of having
> > to call sysfs_create_files at all.
>
> I don't get this one. You mean in general or in this series?
In general before we make more use of the block device provided attrs.