Remount process will release system zone which was allocated before if
"noblock_validity" is specified. If we mount an ext4 file system to two
mountpoints whit default mount options, and then remount one of them
with "noblock_validity", it may trigger a use after free problem when
someone accessing the other one.
# mount /dev/sda foo
# mount /dev/sda bar
User access mountpoint "foo" | Remount mountpoint "bar"
|
ext4_map_blocks() | ext4_remount()
check_block_validity() | ext4_setup_system_zone()
ext4_data_block_valid() | ext4_release_system_zone()
| free system_blks rb nodes
access system_blks rb nodes |
trigger use after free |
At the same time, add_system_zone() can get called during remount as
well so there can be racing ext4_data_block_valid() reading the rbtree
at the same time.
This patch add RCU and seqlock to protect system zone from releasing or
building when doing a remount which inverse current "noblock_validity"
mount option.
Signed-off-by: zhangyi (F) <[email protected]>
Cc: [email protected]
---
Changes since v1:
- I use fio test my v1 patch, it has about 10% perfoamance regression
on my machine (CPU: Intel E5-2690 v3, 10G ramdisk), switch to use
seqlock and RCU to protect system zone instead of spinlock, and this
synchronization scheme seems not affect the perfoamance now.
fs/ext4/block_validity.c | 63 +++++++++++++++++++++++++++++++++++++++---------
fs/ext4/ext4.h | 1 +
fs/ext4/super.c | 1 +
3 files changed, 53 insertions(+), 12 deletions(-)
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 8e83741..a510e4a 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -21,7 +21,10 @@
#include "ext4.h"
struct ext4_system_zone {
- struct rb_node node;
+ union {
+ struct rcu_head rcu;
+ struct rb_node node;
+ };
ext4_fsblk_t start_blk;
unsigned int count;
};
@@ -49,6 +52,12 @@ static inline int can_merge(struct ext4_system_zone *entry1,
return 0;
}
+static void destroy_system_zone(struct rcu_head *rcu)
+{
+ kmem_cache_free(ext4_system_zone_cachep,
+ container_of(rcu, struct ext4_system_zone, rcu));
+}
+
/*
* Mark a range of blocks as belonging to the "system zone" --- that
* is, filesystem metadata blocks which should never be used by
@@ -59,9 +68,12 @@ static int add_system_zone(struct ext4_sb_info *sbi,
unsigned int count)
{
struct ext4_system_zone *new_entry = NULL, *entry;
- struct rb_node **n = &sbi->system_blks.rb_node, *node;
+ struct rb_node **n, *node;
struct rb_node *parent = NULL, *new_node = NULL;
+ write_seqlock(&sbi->system_blks_lock);
+
+ n = &sbi->system_blks.rb_node;
while (*n) {
parent = *n;
entry = rb_entry(parent, struct ext4_system_zone, node);
@@ -84,13 +96,15 @@ static int add_system_zone(struct ext4_sb_info *sbi,
if (!new_entry) {
new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
GFP_KERNEL);
- if (!new_entry)
+ if (!new_entry) {
+ write_sequnlock(&sbi->system_blks_lock);
return -ENOMEM;
+ }
new_entry->start_blk = start_blk;
new_entry->count = count;
new_node = &new_entry->node;
- rb_link_node(new_node, parent, n);
+ rb_link_node_rcu(new_node, parent, n);
rb_insert_color(new_node, &sbi->system_blks);
}
@@ -102,7 +116,7 @@ static int add_system_zone(struct ext4_sb_info *sbi,
new_entry->start_blk = entry->start_blk;
new_entry->count += entry->count;
rb_erase(node, &sbi->system_blks);
- kmem_cache_free(ext4_system_zone_cachep, entry);
+ call_rcu(&entry->rcu, destroy_system_zone);
}
}
@@ -113,9 +127,12 @@ static int add_system_zone(struct ext4_sb_info *sbi,
if (can_merge(new_entry, entry)) {
new_entry->count += entry->count;
rb_erase(node, &sbi->system_blks);
- kmem_cache_free(ext4_system_zone_cachep, entry);
+ call_rcu(&entry->rcu, destroy_system_zone);
}
}
+
+ write_sequnlock(&sbi->system_blks_lock);
+
return 0;
}
@@ -232,11 +249,11 @@ void ext4_release_system_zone(struct super_block *sb)
{
struct ext4_system_zone *entry, *n;
+ rcu_assign_pointer(EXT4_SB(sb)->system_blks.rb_node, NULL);
+
rbtree_postorder_for_each_entry_safe(entry, n,
&EXT4_SB(sb)->system_blks, node)
- kmem_cache_free(ext4_system_zone_cachep, entry);
-
- EXT4_SB(sb)->system_blks = RB_ROOT;
+ call_rcu(&entry->rcu, destroy_system_zone);
}
/*
@@ -248,7 +265,9 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
unsigned int count)
{
struct ext4_system_zone *entry;
- struct rb_node *n = sbi->system_blks.rb_node;
+ struct rb_node *n;
+ unsigned seq = 0;
+ int ret = 1;
if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
(start_blk + count < start_blk) ||
@@ -256,6 +275,17 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
return 0;
}
+
+ /*
+ * Lock the system zone to prevent it being released concurrently
+ * when doing a remount which inverse current "[no]block_validity"
+ * mount option.
+ */
+ rcu_read_lock();
+retry:
+ read_seqbegin_or_lock(&sbi->system_blks_lock, &seq);
+
+ n = rcu_dereference_raw(sbi->system_blks.rb_node);
while (n) {
entry = rb_entry(n, struct ext4_system_zone, node);
if (start_blk + count - 1 < entry->start_blk)
@@ -264,10 +294,19 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
n = n->rb_right;
else {
sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
- return 0;
+ ret = 0;
+ break;
}
}
- return 1;
+ if (!(seq & 1))
+ rcu_read_unlock();
+ if (need_seqretry(&sbi->system_blks_lock, seq)) {
+ seq = 1;
+ goto retry;
+ }
+ done_seqretry(&sbi->system_blks_lock, seq);
+
+ return ret;
}
int ext4_check_blockref(const char *function, unsigned int line,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf660aa..bbf986f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1421,6 +1421,7 @@ struct ext4_sb_info {
int s_jquota_fmt; /* Format of quota to use */
#endif
unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
+ seqlock_t system_blks_lock;
struct rb_root system_blks;
#ifdef EXTENTS_STATS
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4079605..3082b1e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4490,6 +4490,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
ext4_set_resv_clusters(sb);
+ seqlock_init(&sbi->system_blks_lock);
err = ext4_setup_system_zone(sb);
if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize system "
--
2.7.4
Hi "zhangyi,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on linus/master]
[cannot apply to v5.3-rc3 next-20190807]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/zhangyi-F/ext4-fix-potential-use-after-free-in-system-zone-via-remount-with-noblock_validity/20190804-163619
reproduce:
# apt-get install sparse
# sparse version: v0.6.1-rc1-7-g2b96cd8-dirty
make ARCH=x86_64 allmodconfig
make C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__'
If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <[email protected]>
sparse warnings: (new ones prefixed by >>)
include/linux/sched.h:609:43: sparse: sparse: bad integer constant expression
include/linux/sched.h:609:73: sparse: sparse: invalid named zero-width bitfield `value'
include/linux/sched.h:610:43: sparse: sparse: bad integer constant expression
include/linux/sched.h:610:67: sparse: sparse: invalid named zero-width bitfield `bucket_id'
include/linux/rbtree.h:84:9: sparse: sparse: incompatible types in comparison expression (different address spaces):
include/linux/rbtree.h:84:9: sparse: struct rb_node [noderef] <asn:4> *
include/linux/rbtree.h:84:9: sparse: struct rb_node *
>> fs/ext4/block_validity.c:252:9: sparse: sparse: incompatible types in comparison expression (different address spaces):
>> fs/ext4/block_validity.c:252:9: sparse: struct rb_node [noderef] <asn:4> *
>> fs/ext4/block_validity.c:252:9: sparse: struct rb_node *
vim +252 fs/ext4/block_validity.c
246
247 /* Called when the filesystem is unmounted */
248 void ext4_release_system_zone(struct super_block *sb)
249 {
250 struct ext4_system_zone *entry, *n;
251
> 252 rcu_assign_pointer(EXT4_SB(sb)->system_blks.rb_node, NULL);
253
254 rbtree_postorder_for_each_entry_safe(entry, n,
255 &EXT4_SB(sb)->system_blks, node)
256 call_rcu(&entry->rcu, destroy_system_zone);
257 }
258
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation