Ext4 has an optimization mechanism for batched disacrd (FITRIM) that
should help speed up subsequent calls of FITRIM ioctl by skipping the
groups that were previously trimmed. However because the FITRIM allows
to set the minimum size of an extent to trim, ext4 stores the last
minimum extent size and only avoids trimming the group if it was
previously trimmed with minimum extent size equal to, or smaller than
the current call.
There is currently no way to bypass the optimization without
umount/mount cycle. This becomes a problem when the file system is
live migrated to a different storage, because the optimization will
prevent possibly useful discard calls to the storage.
Fix it by exporting the s_last_trim_minblks via sysfs interface which
will allow us to set the minimum size to the number of blocks larger
than subsequent FITRIM call, effectively bypassing the optimization.
By setting the s_last_trim_minblks to INT_MAX the optimization will be
effectively cleared regardless of the previous state, or file system
configuration.
For example:
echo 2147483647 > /sys/fs/ext4/dm-1/last_trim_minblks
Signed-off-by: Lukas Czerner <[email protected]>
Reported-by: Laurent GUERBY <[email protected]>
---
fs/ext4/sysfs.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 2314f7446592..72d2a2c61a44 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -187,6 +187,9 @@ static struct ext4_attr ext4_attr_##_name = { \
#define EXT4_RO_ATTR_SBI_ATOMIC(_name,_elname) \
EXT4_ATTR_OFFSET(_name, 0444, pointer_atomic, ext4_sb_info, _elname)
+#define EXT4_RW_ATTR_SBI_ATOMIC(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0644, pointer_atomic, ext4_sb_info, _elname)
+
#define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \
static struct ext4_attr ext4_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
@@ -245,6 +248,7 @@ EXT4_ATTR(last_error_time, 0444, last_error_time);
EXT4_ATTR(journal_task, 0444, journal_task);
EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
+EXT4_RW_ATTR_SBI_ATOMIC(last_trim_minblks, s_last_trim_minblks);
static unsigned int old_bump_val = 128;
EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -295,6 +299,7 @@ static struct attribute *ext4_attrs[] = {
#endif
ATTR_LIST(mb_prefetch),
ATTR_LIST(mb_prefetch_limit),
+ ATTR_LIST(last_trim_minblks),
NULL,
};
ATTRIBUTE_GROUPS(ext4);
@@ -474,6 +479,15 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
return ret;
*((unsigned long *) ptr) = t;
return len;
+ case attr_pointer_atomic:
+ if (!ptr)
+ return 0;
+ ret = kstrtoint(skip_spaces(buf), 0, (int *)&t);
+ if (ret)
+ return ret;
+ *((unsigned long *) ptr) = t;
+ atomic_set((atomic_t *) ptr, t);
+ return len;
case attr_inode_readahead:
return inode_readahead_blks_store(sbi, buf, len);
case attr_trigger_test_error:
--
2.31.1
Ext4 has an optimization mechanism for batched disacrd (FITRIM) that
should help speed up subsequent calls of FITRIM ioctl by skipping the
groups that were previously trimmed. However because the FITRIM allows
to set the minimum size of an extent to trim, ext4 stores the last
minimum extent size and only avoids trimming the group if it was
previously trimmed with minimum extent size equal to, or smaller than
the current call.
There is currently no way to bypass the optimization without
umount/mount cycle. This becomes a problem when the file system is
live migrated to a different storage, because the optimization will
prevent possibly useful discard calls to the storage.
Fix it by exporting the s_last_trim_minblks via sysfs interface which
will allow us to set the minimum size to the number of blocks larger
than subsequent FITRIM call, effectively bypassing the optimization.
By setting the s_last_trim_minblks to INT_MAX the optimization will be
effectively cleared regardless of the previous state, or file system
configuration.
For example:
echo 2147483647 > /sys/fs/ext4/dm-1/last_trim_minblks
Signed-off-by: Lukas Czerner <[email protected]>
Reported-by: Laurent GUERBY <[email protected]>
---
v2: Remove unnecessary assignment
fs/ext4/sysfs.c | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 2314f7446592..94c86eb8d3cc 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -187,6 +187,9 @@ static struct ext4_attr ext4_attr_##_name = { \
#define EXT4_RO_ATTR_SBI_ATOMIC(_name,_elname) \
EXT4_ATTR_OFFSET(_name, 0444, pointer_atomic, ext4_sb_info, _elname)
+#define EXT4_RW_ATTR_SBI_ATOMIC(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0644, pointer_atomic, ext4_sb_info, _elname)
+
#define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \
static struct ext4_attr ext4_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
@@ -245,6 +248,7 @@ EXT4_ATTR(last_error_time, 0444, last_error_time);
EXT4_ATTR(journal_task, 0444, journal_task);
EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
+EXT4_RW_ATTR_SBI_ATOMIC(last_trim_minblks, s_last_trim_minblks);
static unsigned int old_bump_val = 128;
EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -295,6 +299,7 @@ static struct attribute *ext4_attrs[] = {
#endif
ATTR_LIST(mb_prefetch),
ATTR_LIST(mb_prefetch_limit),
+ ATTR_LIST(last_trim_minblks),
NULL,
};
ATTRIBUTE_GROUPS(ext4);
@@ -474,6 +479,14 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
return ret;
*((unsigned long *) ptr) = t;
return len;
+ case attr_pointer_atomic:
+ if (!ptr)
+ return 0;
+ ret = kstrtoint(skip_spaces(buf), 0, (int *)&t);
+ if (ret)
+ return ret;
+ atomic_set((atomic_t *) ptr, t);
+ return len;
case attr_inode_readahead:
return inode_readahead_blks_store(sbi, buf, len);
case attr_trigger_test_error:
--
2.31.1
On Nov 2, 2021, at 06:19, Lukas Czerner <[email protected]> wrote:
>
> Ext4 has an optimization mechanism for batched disacrd (FITRIM) that
> should help speed up subsequent calls of FITRIM ioctl by skipping the
> groups that were previously trimmed. However because the FITRIM allows
> to set the minimum size of an extent to trim, ext4 stores the last
> minimum extent size and only avoids trimming the group if it was
> previously trimmed with minimum extent size equal to, or smaller than
> the current call.
>
> There is currently no way to bypass the optimization without
> umount/mount cycle. This becomes a problem when the file system is
> live migrated to a different storage, because the optimization will
> prevent possibly useful discard calls to the storage.
>
> Fix it by exporting the s_last_trim_minblks via sysfs interface which
> will allow us to set the minimum size to the number of blocks larger
> than subsequent FITRIM call, effectively bypassing the optimization.
>
> By setting the s_last_trim_minblks to INT_MAX the optimization will be
> effectively cleared regardless of the previous state, or file system
> configuration.
>
> For example:
> echo 2147483647 > /sys/fs/ext4/dm-1/last_trim_minblks
>
> Signed-off-by: Lukas Czerner <[email protected]>
> Reported-by: Laurent GUERBY <[email protected]>
The patch itself looks fine, but it isn't really clear why this field is atomic_t?
There is no particular data integrity needed for this field between threads
and it is not harmful if it is set or read in a racy manner (at most it is set at
the end of ext4_trim_fs() once per filesystem trim).
So it seems like this could be a __u32 (like s_clusters_per_group, which is
the largest possible value it can store)
and avoid the need to add a new type
to the sysfs attribute handling?
Cheers, Andreas
> ---
> v2: Remove unnecessary assignment
>
> fs/ext4/sysfs.c | 13 +++++++++++++
> 1 file changed, 13 insertions(+)
>
> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
> index 2314f7446592..94c86eb8d3cc 100644
> --- a/fs/ext4/sysfs.c
> +++ b/fs/ext4/sysfs.c
> @@ -187,6 +187,9 @@ static struct ext4_attr ext4_attr_##_name = { \
> #define EXT4_RO_ATTR_SBI_ATOMIC(_name,_elname) \
> EXT4_ATTR_OFFSET(_name, 0444, pointer_atomic, ext4_sb_info, _elname)
>
> +#define EXT4_RW_ATTR_SBI_ATOMIC(_name,_elname) \
> + EXT4_ATTR_OFFSET(_name, 0644, pointer_atomic, ext4_sb_info, _elname)
> +
> #define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \
> static struct ext4_attr ext4_attr_##_name = { \
> .attr = {.name = __stringify(_name), .mode = _mode }, \
> @@ -245,6 +248,7 @@ EXT4_ATTR(last_error_time, 0444, last_error_time);
> EXT4_ATTR(journal_task, 0444, journal_task);
> EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
> EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
> +EXT4_RW_ATTR_SBI_ATOMIC(last_trim_minblks, s_last_trim_minblks);
>
> static unsigned int old_bump_val = 128;
> EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
> @@ -295,6 +299,7 @@ static struct attribute *ext4_attrs[] = {
> #endif
> ATTR_LIST(mb_prefetch),
> ATTR_LIST(mb_prefetch_limit),
> + ATTR_LIST(last_trim_minblks),
> NULL,
> };
> ATTRIBUTE_GROUPS(ext4);
> @@ -474,6 +479,14 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
> return ret;
> *((unsigned long *) ptr) = t;
> return len;
> + case attr_pointer_atomic:
> + if (!ptr)
> + return 0;
> + ret = kstrtoint(skip_spaces(buf), 0, (int *)&t);
> + if (ret)
> + return ret;
> + atomic_set((atomic_t *) ptr, t);
> + return len;
> case attr_inode_readahead:
> return inode_readahead_blks_store(sbi, buf, len);
> case attr_trigger_test_error:
> --
> 2.31.1
>
On Tue, Nov 02, 2021 at 04:44:30PM -0800, Andreas Dilger wrote:
> On Nov 2, 2021, at 06:19, Lukas Czerner <[email protected]> wrote:
> >
> > Ext4 has an optimization mechanism for batched disacrd (FITRIM) that
> > should help speed up subsequent calls of FITRIM ioctl by skipping the
> > groups that were previously trimmed. However because the FITRIM allows
> > to set the minimum size of an extent to trim, ext4 stores the last
> > minimum extent size and only avoids trimming the group if it was
> > previously trimmed with minimum extent size equal to, or smaller than
> > the current call.
> >
> > There is currently no way to bypass the optimization without
> > umount/mount cycle. This becomes a problem when the file system is
> > live migrated to a different storage, because the optimization will
> > prevent possibly useful discard calls to the storage.
> >
> > Fix it by exporting the s_last_trim_minblks via sysfs interface which
> > will allow us to set the minimum size to the number of blocks larger
> > than subsequent FITRIM call, effectively bypassing the optimization.
> >
> > By setting the s_last_trim_minblks to INT_MAX the optimization will be
> > effectively cleared regardless of the previous state, or file system
> > configuration.
> >
> > For example:
> > echo 2147483647 > /sys/fs/ext4/dm-1/last_trim_minblks
> >
> > Signed-off-by: Lukas Czerner <[email protected]>
> > Reported-by: Laurent GUERBY <[email protected]>
>
> The patch itself looks fine, but it isn't really clear why this field is atomic_t?
> There is no particular data integrity needed for this field between threads
> and it is not harmful if it is set or read in a racy manner (at most it is set at
> the end of ext4_trim_fs() once per filesystem trim).
Indeed I was asking myself the same question, but I didn't have the need
to change it. There does not seem to be any good reason for it to be
atomic_t.
>
> So it seems like this could be a __u32 (like s_clusters_per_group, which is
> the largest possible value it can store)
> and avoid the need to add a new type
> to the sysfs attribute handling?
Sure I can change it. Although the s_clusters_per_group in sbi structure
is unsigned long and that's what we check it against.
Thanks!
-Lukas
>
> Cheers, Andreas
>
>
> > ---
> > v2: Remove unnecessary assignment
> >
> > fs/ext4/sysfs.c | 13 +++++++++++++
> > 1 file changed, 13 insertions(+)
> >
> > diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
> > index 2314f7446592..94c86eb8d3cc 100644
> > --- a/fs/ext4/sysfs.c
> > +++ b/fs/ext4/sysfs.c
> > @@ -187,6 +187,9 @@ static struct ext4_attr ext4_attr_##_name = { \
> > #define EXT4_RO_ATTR_SBI_ATOMIC(_name,_elname) \
> > EXT4_ATTR_OFFSET(_name, 0444, pointer_atomic, ext4_sb_info, _elname)
> >
> > +#define EXT4_RW_ATTR_SBI_ATOMIC(_name,_elname) \
> > + EXT4_ATTR_OFFSET(_name, 0644, pointer_atomic, ext4_sb_info, _elname)
> > +
> > #define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \
> > static struct ext4_attr ext4_attr_##_name = { \
> > .attr = {.name = __stringify(_name), .mode = _mode }, \
> > @@ -245,6 +248,7 @@ EXT4_ATTR(last_error_time, 0444, last_error_time);
> > EXT4_ATTR(journal_task, 0444, journal_task);
> > EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
> > EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
> > +EXT4_RW_ATTR_SBI_ATOMIC(last_trim_minblks, s_last_trim_minblks);
> >
> > static unsigned int old_bump_val = 128;
> > EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
> > @@ -295,6 +299,7 @@ static struct attribute *ext4_attrs[] = {
> > #endif
> > ATTR_LIST(mb_prefetch),
> > ATTR_LIST(mb_prefetch_limit),
> > + ATTR_LIST(last_trim_minblks),
> > NULL,
> > };
> > ATTRIBUTE_GROUPS(ext4);
> > @@ -474,6 +479,14 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
> > return ret;
> > *((unsigned long *) ptr) = t;
> > return len;
> > + case attr_pointer_atomic:
> > + if (!ptr)
> > + return 0;
> > + ret = kstrtoint(skip_spaces(buf), 0, (int *)&t);
> > + if (ret)
> > + return ret;
> > + atomic_set((atomic_t *) ptr, t);
> > + return len;
> > case attr_inode_readahead:
> > return inode_readahead_blks_store(sbi, buf, len);
> > case attr_trigger_test_error:
> > --
> > 2.31.1
> >
>