Remount process will release system zone which was allocated before if
"noblock_validity" is specified. If we mount an ext4 file system to two
mountpoints whit default mount options, and then remount one of them
with "noblock_validity", it may trigger a use after free problem when
someone accessing the other one.
# mount /dev/sda foo
# mount /dev/sda bar
User access mountpoint "foo" | Remount mountpoint "bar"
|
ext4_map_blocks() | ext4_remount()
check_block_validity() | ext4_setup_system_zone()
ext4_data_block_valid() | ext4_release_system_zone()
| free system_blks rb nodes
access system_blks rb nodes |
trigger use after free |
This patch lock the system zone when accessing it to prevent it being
released when doing a remount with "noblock_validity" mount option.
Signed-off-by: zhangyi (F) <[email protected]>
Cc: [email protected]
---
fs/ext4/block_validity.c | 21 +++++++++++++++++++--
fs/ext4/ext4.h | 2 ++
fs/ext4/super.c | 1 +
3 files changed, 22 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 8e83741..d9c4792 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -191,7 +191,7 @@ int ext4_setup_system_zone(struct super_block *sb)
if (!test_opt(sb, BLOCK_VALIDITY)) {
if (sbi->system_blks.rb_node)
- ext4_release_system_zone(sb);
+ ext4_release_system_zone_lock(sb);
return 0;
}
if (sbi->system_blks.rb_node)
@@ -239,6 +239,14 @@ void ext4_release_system_zone(struct super_block *sb)
EXT4_SB(sb)->system_blks = RB_ROOT;
}
+/* Called when (re)mounting the filesystem without BLOCK_VALIDITY */
+void ext4_release_system_zone_lock(struct super_block *sb)
+{
+ spin_lock(&EXT4_SB(sb)->system_blks_lock);
+ ext4_release_system_zone(sb);
+ spin_unlock(&EXT4_SB(sb)->system_blks_lock);
+}
+
/*
* Returns 1 if the passed-in block region (start_blk,
* start_blk+count) is valid; 0 if some part of the block region
@@ -248,7 +256,7 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
unsigned int count)
{
struct ext4_system_zone *entry;
- struct rb_node *n = sbi->system_blks.rb_node;
+ struct rb_node *n;
if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
(start_blk + count < start_blk) ||
@@ -256,6 +264,13 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
return 0;
}
+
+ /*
+ * Lock the system zone to prevent it being released concurrently
+ * when doing a remount with "noblock_validity" mount option.
+ */
+ spin_lock(&sbi->system_blks_lock);
+ n = sbi->system_blks.rb_node;
while (n) {
entry = rb_entry(n, struct ext4_system_zone, node);
if (start_blk + count - 1 < entry->start_blk)
@@ -264,9 +279,11 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
n = n->rb_right;
else {
sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
+ spin_unlock(&sbi->system_blks_lock);
return 0;
}
}
+ spin_unlock(&sbi->system_blks_lock);
return 1;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf660aa..fd34a7b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1421,6 +1421,7 @@ struct ext4_sb_info {
int s_jquota_fmt; /* Format of quota to use */
#endif
unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
+ spinlock_t system_blks_lock;
struct rb_root system_blks;
#ifdef EXTENTS_STATS
@@ -3191,6 +3192,7 @@ extern void ext4_exit_sysfs(void);
/* block_validity */
extern void ext4_release_system_zone(struct super_block *sb);
+extern void ext4_release_system_zone_lock(struct super_block *sb);
extern int ext4_setup_system_zone(struct super_block *sb);
extern int __init ext4_init_system_zone(void);
extern void ext4_exit_system_zone(void);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4079605..c95f972 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4490,6 +4490,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
ext4_set_resv_clusters(sb);
+ spin_lock_init(&sbi->system_blks_lock);
err = ext4_setup_system_zone(sb);
if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize system "
--
2.7.4
On Wed 24-07-19 20:11:08, zhangyi (F) wrote:
> Remount process will release system zone which was allocated before if
> "noblock_validity" is specified. If we mount an ext4 file system to two
> mountpoints whit default mount options, and then remount one of them
> with "noblock_validity", it may trigger a use after free problem when
> someone accessing the other one.
>
> # mount /dev/sda foo
> # mount /dev/sda bar
>
> User access mountpoint "foo" | Remount mountpoint "bar"
> |
> ext4_map_blocks() | ext4_remount()
> check_block_validity() | ext4_setup_system_zone()
> ext4_data_block_valid() | ext4_release_system_zone()
> | free system_blks rb nodes
> access system_blks rb nodes |
> trigger use after free |
>
> This patch lock the system zone when accessing it to prevent it being
> released when doing a remount with "noblock_validity" mount option.
>
> Signed-off-by: zhangyi (F) <[email protected]>
> Cc: [email protected]
Thanks for the patch. It is a good catch. Some small comments below.
> diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
> index 8e83741..d9c4792 100644
> --- a/fs/ext4/block_validity.c
> +++ b/fs/ext4/block_validity.c
> @@ -191,7 +191,7 @@ int ext4_setup_system_zone(struct super_block *sb)
>
> if (!test_opt(sb, BLOCK_VALIDITY)) {
> if (sbi->system_blks.rb_node)
> - ext4_release_system_zone(sb);
> + ext4_release_system_zone_lock(sb);
> return 0;
> }
> if (sbi->system_blks.rb_node)
> @@ -239,6 +239,14 @@ void ext4_release_system_zone(struct super_block *sb)
> EXT4_SB(sb)->system_blks = RB_ROOT;
> }
>
> +/* Called when (re)mounting the filesystem without BLOCK_VALIDITY */
> +void ext4_release_system_zone_lock(struct super_block *sb)
> +{
> + spin_lock(&EXT4_SB(sb)->system_blks_lock);
> + ext4_release_system_zone(sb);
> + spin_unlock(&EXT4_SB(sb)->system_blks_lock);
> +}
Is there any reason why ext4_release_system_zone() should not always take
the system_blks_lock lock? I understand it may not be necessary in all the
cases but it won't hurt either...
Also ext4_setup_system_zone() should IMO use system_blks_lock to protect
modifications of the rbtree. It can get called during remount as well so
there can be racing ext4_data_block_valid() reading the rbtree at the same
time.
> @@ -256,6 +264,13 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
> sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
> return 0;
> }
> +
> + /*
> + * Lock the system zone to prevent it being released concurrently
> + * when doing a remount with "noblock_validity" mount option.
> + */
> + spin_lock(&sbi->system_blks_lock);
> + n = sbi->system_blks.rb_node;
> while (n) {
> entry = rb_entry(n, struct ext4_system_zone, node);
> if (start_blk + count - 1 < entry->start_blk)
> @@ -264,9 +279,11 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
> n = n->rb_right;
> else {
> sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
> + spin_unlock(&sbi->system_blks_lock);
> return 0;
> }
> }
> + spin_unlock(&sbi->system_blks_lock);
> return 1;
> }
So this will not only serialize ext4_data_block_valid() against remounts
but also against each other. So I suspect that a read-heavy workload on
fast storage could contend on your new fs-wide spinlock. So I think it
would be better to have some other synchronization scheme to avoid the
race.
If nothing else, rwlock_t would allow concurrent ext4_data_block_valid()
calls. It is still not ideal as the calls would be still bouncing around
the cacheline when updating the lock itself but better than nothing.
Ideal (performance-wise) would be to use RCU scheme for this -
ext4_data_block_valid() would be RCU protected when reading the RB-tree,
teardown of the block validity information would clear
sbi->system_blks.rb_node and then defer actual freeing of the tree nodes to
RCU callback. Setup would first construct the rbtree and then just set
sbi->system_blks.rb_node to the root of the constructed tree.
That being said I'm not *sure* this is going to be a performance issue
since ext4_map_blocks() are not that frequent and the lock hold times will
be very short (needs testing). So maybe rwlock_t is a reasonable compromise
between complexity and performance.
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
Thanks for your suggestions, I will look at the RCU method to solve this problem.
Thanks,
Yi.
On 2019/7/31 22:08, Jan Kara Wrote:
> On Wed 24-07-19 20:11:08, zhangyi (F) wrote:
>> Remount process will release system zone which was allocated before if
>> "noblock_validity" is specified. If we mount an ext4 file system to two
>> mountpoints whit default mount options, and then remount one of them
>> with "noblock_validity", it may trigger a use after free problem when
>> someone accessing the other one.
>>
>> # mount /dev/sda foo
>> # mount /dev/sda bar
>>
>> User access mountpoint "foo" | Remount mountpoint "bar"
>> |
>> ext4_map_blocks() | ext4_remount()
>> check_block_validity() | ext4_setup_system_zone()
>> ext4_data_block_valid() | ext4_release_system_zone()
>> | free system_blks rb nodes
>> access system_blks rb nodes |
>> trigger use after free |
>>
>> This patch lock the system zone when accessing it to prevent it being
>> released when doing a remount with "noblock_validity" mount option.
>>
>> Signed-off-by: zhangyi (F) <[email protected]>
>> Cc: [email protected]
>
> Thanks for the patch. It is a good catch. Some small comments below.
>
>> diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
>> index 8e83741..d9c4792 100644
>> --- a/fs/ext4/block_validity.c
>> +++ b/fs/ext4/block_validity.c
>> @@ -191,7 +191,7 @@ int ext4_setup_system_zone(struct super_block *sb)
>>
>> if (!test_opt(sb, BLOCK_VALIDITY)) {
>> if (sbi->system_blks.rb_node)
>> - ext4_release_system_zone(sb);
>> + ext4_release_system_zone_lock(sb);
>> return 0;
>> }
>> if (sbi->system_blks.rb_node)
>> @@ -239,6 +239,14 @@ void ext4_release_system_zone(struct super_block *sb)
>> EXT4_SB(sb)->system_blks = RB_ROOT;
>> }
>>
>> +/* Called when (re)mounting the filesystem without BLOCK_VALIDITY */
>> +void ext4_release_system_zone_lock(struct super_block *sb)
>> +{
>> + spin_lock(&EXT4_SB(sb)->system_blks_lock);
>> + ext4_release_system_zone(sb);
>> + spin_unlock(&EXT4_SB(sb)->system_blks_lock);
>> +}
>
> Is there any reason why ext4_release_system_zone() should not always take
> the system_blks_lock lock? I understand it may not be necessary in all the
> cases but it won't hurt either...
>
> Also ext4_setup_system_zone() should IMO use system_blks_lock to protect
> modifications of the rbtree. It can get called during remount as well so
> there can be racing ext4_data_block_valid() reading the rbtree at the same
> time.
>
>> @@ -256,6 +264,13 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
>> sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
>> return 0;
>> }
>> +
>> + /*
>> + * Lock the system zone to prevent it being released concurrently
>> + * when doing a remount with "noblock_validity" mount option.
>> + */
>> + spin_lock(&sbi->system_blks_lock);
>> + n = sbi->system_blks.rb_node;
>> while (n) {
>> entry = rb_entry(n, struct ext4_system_zone, node);
>> if (start_blk + count - 1 < entry->start_blk)
>> @@ -264,9 +279,11 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
>> n = n->rb_right;
>> else {
>> sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
>> + spin_unlock(&sbi->system_blks_lock);
>> return 0;
>> }
>> }
>> + spin_unlock(&sbi->system_blks_lock);
>> return 1;
>> }
>
> So this will not only serialize ext4_data_block_valid() against remounts
> but also against each other. So I suspect that a read-heavy workload on
> fast storage could contend on your new fs-wide spinlock. So I think it
> would be better to have some other synchronization scheme to avoid the
> race.
>
> If nothing else, rwlock_t would allow concurrent ext4_data_block_valid()
> calls. It is still not ideal as the calls would be still bouncing around
> the cacheline when updating the lock itself but better than nothing.
>
> Ideal (performance-wise) would be to use RCU scheme for this -
> ext4_data_block_valid() would be RCU protected when reading the RB-tree,
> teardown of the block validity information would clear
> sbi->system_blks.rb_node and then defer actual freeing of the tree nodes to
> RCU callback. Setup would first construct the rbtree and then just set
> sbi->system_blks.rb_node to the root of the constructed tree.
>
> That being said I'm not *sure* this is going to be a performance issue
> since ext4_map_blocks() are not that frequent and the lock hold times will
> be very short (needs testing). So maybe rwlock_t is a reasonable compromise
> between complexity and performance.
>
> Honza
>