2019-09-03 00:01:23

by Jan Stancek

[permalink] [raw]
Subject: [PATCH] fat: fix corruption in fat_alloc_new_dir()

sb_getblk does not guarantee that buffer_head is uptodate. If there is
async read running in parallel for same buffer_head, it can overwrite
just initialized msdos_dir_entry, leading to corruption:
FAT-fs (loop0): error, corrupted directory (invalid entries)
FAT-fs (loop0): Filesystem has been set read-only

This can happen for example during LTP statx04, which creates loop
device, formats it (mkfs.vfat), mounts it and immediately creates
a new directory. In parallel, systemd-udevd is probing new block
device, which leads to async read.

do_mkdirat ksys_read
vfs_mkdir vfs_read
vfat_mkdir __vfs_read
fat_alloc_new_dir new_sync_read
/* init de[0], de[1] */ blkdev_read_iter
generic_file_read_iter
generic_file_buffered_read
blkdev_readpage
block_read_full_page

Faster reproducer (based on LTP statx04):
--------------------------------- 8< ---------------------------------
int main(void)
{
int i, j, ret, fd, loop_fd, ctrl_fd;
int loop_num;
char loopdev[256], tmp[256], testfile[256];

mkdir("/tmp/mntpoint", 0777);
for (i = 0; ; i++) {
printf("Iteration: %d\n", i);
sprintf(testfile, "/tmp/test.img.%d", getpid());

ctrl_fd = open("/dev/loop-control", O_RDWR);
loop_num = ioctl(ctrl_fd, LOOP_CTL_GET_FREE);
close(ctrl_fd);
sprintf(loopdev, "/dev/loop%d", loop_num);

fd = open(testfile, O_WRONLY|O_CREAT|O_TRUNC, 0600);
fallocate(fd, 0, 0, 256*1024*1024);
close(fd);

fd = open(testfile, O_RDWR);
loop_fd = open(loopdev, O_RDWR);
ioctl(loop_fd, LOOP_SET_FD, fd);
close(loop_fd);
close(fd);

sprintf(tmp, "mkfs.vfat %s", loopdev);
system(tmp);
mount(loopdev, "/tmp/mntpoint", "vfat", 0, NULL);

for (j = 0; j < 200; j++) {
sprintf(tmp, "/tmp/mntpoint/testdir%d", j);
ret = mkdir(tmp, 0777);
if (ret) {
perror("mkdir");
break;
}
}

umount("/tmp/mntpoint");
loop_fd = open(loopdev, O_RDWR);
ioctl(loop_fd, LOOP_CLR_FD, fd);
close(loop_fd);
unlink(testfile);

if (ret)
break;
}

return 0;
}
--------------------------------- 8< ---------------------------------

Issue triggers within minute on HPE Apollo 70 (arm64, 64GB RAM, 224 CPUs).

Signed-off-by: Jan Stancek <[email protected]>
Cc: OGAWA Hirofumi <[email protected]>
---
fs/fat/dir.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 1bda2ab6745b..474fd6873ec8 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -1149,7 +1149,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts)
goto error;

blknr = fat_clus_to_blknr(sbi, cluster);
- bhs[0] = sb_getblk(sb, blknr);
+ bhs[0] = sb_bread(sb, blknr);
if (!bhs[0]) {
err = -ENOMEM;
goto error_free;
--
1.8.3.1


2019-09-09 09:10:50

by OGAWA Hirofumi

[permalink] [raw]
Subject: Re: [PATCH] fat: fix corruption in fat_alloc_new_dir()

Jan Stancek <[email protected]> writes:

> sb_getblk does not guarantee that buffer_head is uptodate. If there is
> async read running in parallel for same buffer_head, it can overwrite
> just initialized msdos_dir_entry, leading to corruption:
> FAT-fs (loop0): error, corrupted directory (invalid entries)
> FAT-fs (loop0): Filesystem has been set read-only
>
> This can happen for example during LTP statx04, which creates loop
> device, formats it (mkfs.vfat), mounts it and immediately creates
> a new directory. In parallel, systemd-udevd is probing new block
> device, which leads to async read.
>
> do_mkdirat ksys_read
> vfs_mkdir vfs_read
> vfat_mkdir __vfs_read
> fat_alloc_new_dir new_sync_read
> /* init de[0], de[1] */ blkdev_read_iter
> generic_file_read_iter
> generic_file_buffered_read
> blkdev_readpage
> block_read_full_page
>
> Faster reproducer (based on LTP statx04):
>
> int main(void)
> {
> int i, j, ret, fd, loop_fd, ctrl_fd;
> int loop_num;
> char loopdev[256], tmp[256], testfile[256];
>
> mkdir("/tmp/mntpoint", 0777);
> for (i = 0; ; i++) {
> printf("Iteration: %d\n", i);
> sprintf(testfile, "/tmp/test.img.%d", getpid());
>
> ctrl_fd = open("/dev/loop-control", O_RDWR);
> loop_num = ioctl(ctrl_fd, LOOP_CTL_GET_FREE);
> close(ctrl_fd);
> sprintf(loopdev, "/dev/loop%d", loop_num);
>
> fd = open(testfile, O_WRONLY|O_CREAT|O_TRUNC, 0600);
> fallocate(fd, 0, 0, 256*1024*1024);
> close(fd);
>
> fd = open(testfile, O_RDWR);
> loop_fd = open(loopdev, O_RDWR);
> ioctl(loop_fd, LOOP_SET_FD, fd);
> close(loop_fd);
> close(fd);
>
> sprintf(tmp, "mkfs.vfat %s", loopdev);
> system(tmp);
> mount(loopdev, "/tmp/mntpoint", "vfat", 0, NULL);
>
> for (j = 0; j < 200; j++) {
> sprintf(tmp, "/tmp/mntpoint/testdir%d", j);
> ret = mkdir(tmp, 0777);
> if (ret) {
> perror("mkdir");
> break;
> }
> }
>
> umount("/tmp/mntpoint");
> loop_fd = open(loopdev, O_RDWR);
> ioctl(loop_fd, LOOP_CLR_FD, fd);
> close(loop_fd);
> unlink(testfile);
>
> if (ret)
> break;
> }
>
> return 0;
> }
>
> Issue triggers within minute on HPE Apollo 70 (arm64, 64GB RAM, 224 CPUs).

Using the device while mounting same device doesn't work reliably like
this race. (getblk() is intentionally used to get the buffer to write
new data.)

mount(2) internally opens the device by EXCL mode, so I guess udev opens
without EXCL (I dont know if it is intent or not).

Thanks.
--
OGAWA Hirofumi <[email protected]>

2019-09-09 13:28:33

by Jan Stancek

[permalink] [raw]
Subject: Re: [PATCH] fat: fix corruption in fat_alloc_new_dir()



----- Original Message -----
> Jan Stancek <[email protected]> writes:
>
> > sb_getblk does not guarantee that buffer_head is uptodate. If there is
> > async read running in parallel for same buffer_head, it can overwrite
> > just initialized msdos_dir_entry, leading to corruption:
> > FAT-fs (loop0): error, corrupted directory (invalid entries)
> > FAT-fs (loop0): Filesystem has been set read-only
> >
> > This can happen for example during LTP statx04, which creates loop
> > device, formats it (mkfs.vfat), mounts it and immediately creates
> > a new directory. In parallel, systemd-udevd is probing new block
> > device, which leads to async read.
> >
> > do_mkdirat ksys_read
> > vfs_mkdir vfs_read
> > vfat_mkdir __vfs_read
> > fat_alloc_new_dir new_sync_read
> > /* init de[0], de[1] */ blkdev_read_iter
> > generic_file_read_iter
> > generic_file_buffered_read
> > blkdev_readpage
> > block_read_full_page
> >
> > Faster reproducer (based on LTP statx04):
> >
> > int main(void)
> > {
> > int i, j, ret, fd, loop_fd, ctrl_fd;
> > int loop_num;
> > char loopdev[256], tmp[256], testfile[256];
> >
> > mkdir("/tmp/mntpoint", 0777);
> > for (i = 0; ; i++) {
> > printf("Iteration: %d\n", i);
> > sprintf(testfile, "/tmp/test.img.%d", getpid());
> >
> > ctrl_fd = open("/dev/loop-control", O_RDWR);
> > loop_num = ioctl(ctrl_fd, LOOP_CTL_GET_FREE);
> > close(ctrl_fd);
> > sprintf(loopdev, "/dev/loop%d", loop_num);
> >
> > fd = open(testfile, O_WRONLY|O_CREAT|O_TRUNC, 0600);
> > fallocate(fd, 0, 0, 256*1024*1024);
> > close(fd);
> >
> > fd = open(testfile, O_RDWR);
> > loop_fd = open(loopdev, O_RDWR);
> > ioctl(loop_fd, LOOP_SET_FD, fd);
> > close(loop_fd);
> > close(fd);
> >
> > sprintf(tmp, "mkfs.vfat %s", loopdev);
> > system(tmp);
> > mount(loopdev, "/tmp/mntpoint", "vfat", 0, NULL);
> >
> > for (j = 0; j < 200; j++) {
> > sprintf(tmp, "/tmp/mntpoint/testdir%d", j);
> > ret = mkdir(tmp, 0777);
> > if (ret) {
> > perror("mkdir");
> > break;
> > }
> > }
> >
> > umount("/tmp/mntpoint");
> > loop_fd = open(loopdev, O_RDWR);
> > ioctl(loop_fd, LOOP_CLR_FD, fd);
> > close(loop_fd);
> > unlink(testfile);
> >
> > if (ret)
> > break;
> > }
> >
> > return 0;
> > }
> >
> > Issue triggers within minute on HPE Apollo 70 (arm64, 64GB RAM, 224 CPUs).
>
> Using the device while mounting same device doesn't work reliably like
> this race. (getblk() is intentionally used to get the buffer to write
> new data.)

Are you saying this is expected even if 'usage' is just read?

>
> mount(2) internally opens the device by EXCL mode, so I guess udev opens
> without EXCL (I dont know if it is intent or not).

I gave this a try and added O_EXCL to udev-builtin-blkid.c. My system had trouble
booting, it was getting stuck on mounting LVM volumes.

So, I'm not sure how to move forward here.

Regards,
Jan

2019-09-10 18:48:20

by OGAWA Hirofumi

[permalink] [raw]
Subject: Re: [PATCH] fat: fix corruption in fat_alloc_new_dir()

Jan Stancek <[email protected]> writes:

>> Using the device while mounting same device doesn't work reliably like
>> this race. (getblk() is intentionally used to get the buffer to write
>> new data.)
>
> Are you saying this is expected even if 'usage' is just read?

Yes, assuming exclusive access.

>> mount(2) internally opens the device by EXCL mode, so I guess udev opens
>> without EXCL (I dont know if it is intent or not).
>
> I gave this a try and added O_EXCL to udev-builtin-blkid.c. My system had trouble
> booting, it was getting stuck on mounting LVM volumes.
>
> So, I'm not sure how to move forward here.

OK. I'm still think the userspace should avoid to use blockdev while
mounting though, this patch will workaround this race with small race.

Can you test this?

Thanks.
--
OGAWA Hirofumi <[email protected]>


[PATCH] fat: Workaround the race with userspace's read via blockdev while mounting

If userspace reads the buffer via blockdev while mounting,
sb_getblk()+modify can race with buffer read via blockdev.

For example,

FS userspace
bh = sb_getblk()
modify bh->b_data
read
ll_rw_block(bh)
fill bh->b_data by on-disk data
/* lost modified data by FS */
set_buffer_uptodate(bh)
set_buffer_uptodate(bh)

The userspace should not use the blockdev while mounting though, the
udev seems to be already doing this. Although I think the udev should
try to avoid this, workaround the race by small overhead.

Signed-off-by: OGAWA Hirofumi <[email protected]>
---

fs/fat/dir.c | 13 +++++++++++--
fs/fat/fatent.c | 3 +++
2 files changed, 14 insertions(+), 2 deletions(-)

diff -puN fs/fat/dir.c~fat-workaround-getblk fs/fat/dir.c
--- linux/fs/fat/dir.c~fat-workaround-getblk 2019-09-10 09:29:51.137292020 +0900
+++ linux-hirofumi/fs/fat/dir.c 2019-09-10 09:39:15.366295152 +0900
@@ -1100,8 +1100,11 @@ static int fat_zeroed_cluster(struct ino
err = -ENOMEM;
goto error;
}
+ /* Avoid race with userspace read via bdev */
+ lock_buffer(bhs[n]);
memset(bhs[n]->b_data, 0, sb->s_blocksize);
set_buffer_uptodate(bhs[n]);
+ unlock_buffer(bhs[n]);
mark_buffer_dirty_inode(bhs[n], dir);

n++;
@@ -1158,6 +1161,8 @@ int fat_alloc_new_dir(struct inode *dir,
fat_time_unix2fat(sbi, ts, &time, &date, &time_cs);

de = (struct msdos_dir_entry *)bhs[0]->b_data;
+ /* Avoid race with userspace read via bdev */
+ lock_buffer(bhs[0]);
/* filling the new directory slots ("." and ".." entries) */
memcpy(de[0].name, MSDOS_DOT, MSDOS_NAME);
memcpy(de[1].name, MSDOS_DOTDOT, MSDOS_NAME);
@@ -1180,6 +1185,7 @@ int fat_alloc_new_dir(struct inode *dir,
de[0].size = de[1].size = 0;
memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de));
set_buffer_uptodate(bhs[0]);
+ unlock_buffer(bhs[0]);
mark_buffer_dirty_inode(bhs[0], dir);

err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE);
@@ -1237,11 +1243,14 @@ static int fat_add_new_entries(struct in

/* fill the directory entry */
copy = min(size, sb->s_blocksize);
+ /* Avoid race with userspace read via bdev */
+ lock_buffer(bhs[n]);
memcpy(bhs[n]->b_data, slots, copy);
- slots += copy;
- size -= copy;
set_buffer_uptodate(bhs[n]);
+ unlock_buffer(bhs[n]);
mark_buffer_dirty_inode(bhs[n], dir);
+ slots += copy;
+ size -= copy;
if (!size)
break;
n++;
diff -puN fs/fat/fatent.c~fat-workaround-getblk fs/fat/fatent.c
--- linux/fs/fat/fatent.c~fat-workaround-getblk 2019-09-10 09:36:20.247225406 +0900
+++ linux-hirofumi/fs/fat/fatent.c 2019-09-10 09:36:43.847100048 +0900
@@ -388,8 +388,11 @@ static int fat_mirror_bhs(struct super_b
err = -ENOMEM;
goto error;
}
+ /* Avoid race with userspace read via bdev */
+ lock_buffer(c_bh);
memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize);
set_buffer_uptodate(c_bh);
+ unlock_buffer(c_bh);
mark_buffer_dirty_inode(c_bh, sbi->fat_inode);
if (sb->s_flags & SB_SYNCHRONOUS)
err = sync_dirty_buffer(c_bh);
_

2019-09-10 19:18:27

by Jan Stancek

[permalink] [raw]
Subject: Re: [PATCH] fat: fix corruption in fat_alloc_new_dir()



----- Original Message -----
> Jan Stancek <[email protected]> writes:
>
> >> Using the device while mounting same device doesn't work reliably like
> >> this race. (getblk() is intentionally used to get the buffer to write
> >> new data.)
> >
> > Are you saying this is expected even if 'usage' is just read?
>
> Yes, assuming exclusive access.

Seems we were lucky so far to only hit this with FAT.

I also tried couple variations of reproducer:

- Disabling udevd and running just "blkid --probe" in parallel
also reproduced it
- Disabling udevd and running read() on first 1024 sectors in parallel
also reproduced it
- aio_read() submitted prior to mount could reproduce it,
as long as fd was held open
- I couldn't reproduce it with fadvise/madvise WILLNEED submitted prior to mount

>
> >> mount(2) internally opens the device by EXCL mode, so I guess udev opens
> >> without EXCL (I dont know if it is intent or not).
> >
> > I gave this a try and added O_EXCL to udev-builtin-blkid.c. My system had
> > trouble
> > booting, it was getting stuck on mounting LVM volumes.
> >
> > So, I'm not sure how to move forward here.
>
> OK. I'm still think the userspace should avoid to use blockdev while
> mounting though, this patch will workaround this race with small race.

https://systemd.io/BLOCK_DEVICE_LOCKING.html mentions flock(LOCK_EX) as a way
to avoid probing while "another program concurrently modifies a superblock or
partition table". Adding flock(LOCK_EX) works around the problem too, but that
would address problem only for LTP (and tools/scripts that use this approach).

>
> Can you test this?
>
> Thanks.
> --
> OGAWA Hirofumi <[email protected]>
>
>
> [PATCH] fat: Workaround the race with userspace's read via blockdev while
> mounting

I ran reproducer on patched kernel for 5 hours, it made over 25000 iterations,
there was no corruption. Thank you for looking at this.

Regards,
Jan

2019-09-11 06:58:18

by OGAWA Hirofumi

[permalink] [raw]
Subject: [PATCH] fat: Workaround the race with userspace's read via blockdev while mounting

If userspace reads the buffer via blockdev while mounting,
sb_getblk()+modify can race with buffer read via blockdev.

For example,

FS userspace
bh = sb_getblk()
modify bh->b_data
read
ll_rw_block(bh)
fill bh->b_data by on-disk data
/* lost modified data by FS */
set_buffer_uptodate(bh)
set_buffer_uptodate(bh)

The userspace should not use the blockdev while mounting though, the
udev seems to be already doing this. Although I think the udev should
try to avoid this, workaround the race by small overhead.

Reported-by: Jan Stancek <[email protected]>
Tested-by: Jan Stancek <[email protected]>
Signed-off-by: OGAWA Hirofumi <[email protected]>
---

fs/fat/dir.c | 13 +++++++++++--
fs/fat/fatent.c | 3 +++
2 files changed, 14 insertions(+), 2 deletions(-)

diff -puN fs/fat/dir.c~fat-workaround-getblk fs/fat/dir.c
--- linux/fs/fat/dir.c~fat-workaround-getblk 2019-09-10 09:29:51.137292020 +0900
+++ linux-hirofumi/fs/fat/dir.c 2019-09-10 09:39:15.366295152 +0900
@@ -1100,8 +1100,11 @@ static int fat_zeroed_cluster(struct ino
err = -ENOMEM;
goto error;
}
+ /* Avoid race with userspace read via bdev */
+ lock_buffer(bhs[n]);
memset(bhs[n]->b_data, 0, sb->s_blocksize);
set_buffer_uptodate(bhs[n]);
+ unlock_buffer(bhs[n]);
mark_buffer_dirty_inode(bhs[n], dir);

n++;
@@ -1158,6 +1161,8 @@ int fat_alloc_new_dir(struct inode *dir,
fat_time_unix2fat(sbi, ts, &time, &date, &time_cs);

de = (struct msdos_dir_entry *)bhs[0]->b_data;
+ /* Avoid race with userspace read via bdev */
+ lock_buffer(bhs[0]);
/* filling the new directory slots ("." and ".." entries) */
memcpy(de[0].name, MSDOS_DOT, MSDOS_NAME);
memcpy(de[1].name, MSDOS_DOTDOT, MSDOS_NAME);
@@ -1180,6 +1185,7 @@ int fat_alloc_new_dir(struct inode *dir,
de[0].size = de[1].size = 0;
memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de));
set_buffer_uptodate(bhs[0]);
+ unlock_buffer(bhs[0]);
mark_buffer_dirty_inode(bhs[0], dir);

err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE);
@@ -1237,11 +1243,14 @@ static int fat_add_new_entries(struct in

/* fill the directory entry */
copy = min(size, sb->s_blocksize);
+ /* Avoid race with userspace read via bdev */
+ lock_buffer(bhs[n]);
memcpy(bhs[n]->b_data, slots, copy);
- slots += copy;
- size -= copy;
set_buffer_uptodate(bhs[n]);
+ unlock_buffer(bhs[n]);
mark_buffer_dirty_inode(bhs[n], dir);
+ slots += copy;
+ size -= copy;
if (!size)
break;
n++;
diff -puN fs/fat/fatent.c~fat-workaround-getblk fs/fat/fatent.c
--- linux/fs/fat/fatent.c~fat-workaround-getblk 2019-09-10 09:36:20.247225406 +0900
+++ linux-hirofumi/fs/fat/fatent.c 2019-09-10 09:36:43.847100048 +0900
@@ -388,8 +388,11 @@ static int fat_mirror_bhs(struct super_b
err = -ENOMEM;
goto error;
}
+ /* Avoid race with userspace read via bdev */
+ lock_buffer(c_bh);
memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize);
set_buffer_uptodate(c_bh);
+ unlock_buffer(c_bh);
mark_buffer_dirty_inode(c_bh, sbi->fat_inode);
if (sb->s_flags & SB_SYNCHRONOUS)
err = sync_dirty_buffer(c_bh);
_