From: Daeho Jeong <[email protected]>
Added a new ioctl to send discard commands or/and zero out
to whole data area of a regular file for security reason.
Signed-off-by: Daeho Jeong <[email protected]>
---
fs/f2fs/f2fs.h | 8 +++
fs/f2fs/file.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 151 insertions(+)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index c812fb8e2d9c..ca139fac5a73 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -434,6 +434,7 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
_IOR(F2FS_IOCTL_MAGIC, 18, __u64)
#define F2FS_IOC_RESERVE_COMPRESS_BLOCKS \
_IOR(F2FS_IOCTL_MAGIC, 19, __u64)
+#define F2FS_IOC_SEC_TRIM_FILE _IOW(F2FS_IOCTL_MAGIC, 20, __u32)
#define F2FS_IOC_GET_VOLUME_NAME FS_IOC_GETFSLABEL
#define F2FS_IOC_SET_VOLUME_NAME FS_IOC_SETFSLABEL
@@ -453,6 +454,13 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
#define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */
#define F2FS_GOING_DOWN_NEED_FSCK 0x4 /* going down to trigger fsck */
+/*
+ * Flags used by F2FS_IOC_SEC_TRIM_FILE
+ */
+#define F2FS_TRIM_FILE_DISCARD 0x1 /* send discard command */
+#define F2FS_TRIM_FILE_ZEROOUT 0x2 /* zero out */
+#define F2FS_TRIM_FILE_MASK 0x3
+
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
* ioctl commands in 32 bit emulation
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index dfa1ac2d751a..ba9b7ec5d6bf 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -3749,6 +3749,146 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
return ret;
}
+static int f2fs_secure_erase(struct block_device *bdev, block_t block,
+ block_t len, u32 flags)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+ sector_t sector = SECTOR_FROM_BLOCK(block);
+ sector_t nr_sects = SECTOR_FROM_BLOCK(len);
+ int ret = 0;
+
+ if (!q)
+ return -ENXIO;
+
+ if (flags & F2FS_TRIM_FILE_DISCARD)
+ ret = blkdev_issue_discard(bdev, sector, nr_sects, GFP_NOFS,
+ blk_queue_secure_erase(q) ?
+ BLKDEV_DISCARD_SECURE : 0);
+
+ if (!ret && (flags & F2FS_TRIM_FILE_ZEROOUT))
+ ret = blkdev_issue_zeroout(bdev, sector, nr_sects, GFP_NOFS, 0);
+
+ return ret;
+}
+
+static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct address_space *mapping = inode->i_mapping;
+ struct block_device *prev_bdev = NULL;
+ pgoff_t index, pg_start = 0, pg_end;
+ block_t prev_block = 0, len = 0;
+ u32 flags;
+ int ret = 0;
+
+ if (!(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+
+ if (get_user(flags, (u32 __user *)arg))
+ return -EFAULT;
+ if (flags == 0 || (flags & ~F2FS_TRIM_FILE_MASK))
+ return -EINVAL;
+
+ if ((flags & F2FS_TRIM_FILE_DISCARD) && !f2fs_hw_support_discard(sbi))
+ return -EOPNOTSUPP;
+
+ file_start_write(filp);
+ inode_lock(inode);
+
+ if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode) ||
+ f2fs_compressed_file(inode)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (!inode->i_size)
+ goto err;
+ pg_end = DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ goto err;
+
+ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ down_write(&F2FS_I(inode)->i_mmap_sem);
+
+ ret = filemap_write_and_wait(mapping);
+ if (ret)
+ goto out;
+
+ truncate_inode_pages(mapping, 0);
+
+ for (index = pg_start; index < pg_end;) {
+ struct dnode_of_data dn;
+ unsigned int end_offset;
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ ret = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
+ if (ret)
+ goto out;
+
+ end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
+ if (pg_end < end_offset + index)
+ end_offset = pg_end - index;
+
+ for (; dn.ofs_in_node < end_offset;
+ dn.ofs_in_node++, index++) {
+ struct block_device *cur_bdev;
+ block_t blkaddr = f2fs_data_blkaddr(&dn);
+
+ if (__is_valid_data_blkaddr(blkaddr)) {
+ if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode),
+ blkaddr, DATA_GENERIC_ENHANCE)) {
+ ret = -EFSCORRUPTED;
+ goto out;
+ }
+ } else
+ continue;
+
+ cur_bdev = f2fs_target_device(sbi, blkaddr, NULL);
+ if (f2fs_is_multi_device(sbi)) {
+ int i = f2fs_target_device_index(sbi, blkaddr);
+
+ blkaddr -= FDEV(i).start_blk;
+ }
+
+ if (len) {
+ if (prev_bdev == cur_bdev &&
+ blkaddr == prev_block + len) {
+ len++;
+ } else {
+ ret = f2fs_secure_erase(prev_bdev,
+ prev_block, len, flags);
+ if (ret)
+ goto out;
+
+ len = 0;
+ }
+ }
+
+ if (!len) {
+ prev_bdev = cur_bdev;
+ prev_block = blkaddr;
+ len = 1;
+ }
+ }
+
+ f2fs_put_dnode(&dn);
+ }
+
+ if (len)
+ ret = f2fs_secure_erase(prev_bdev, prev_block, len, flags);
+out:
+ up_write(&F2FS_I(inode)->i_mmap_sem);
+ up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+err:
+ inode_unlock(inode);
+ file_end_write(filp);
+
+ return ret;
+}
+
long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp)))))
@@ -3835,6 +3975,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_release_compress_blocks(filp, arg);
case F2FS_IOC_RESERVE_COMPRESS_BLOCKS:
return f2fs_reserve_compress_blocks(filp, arg);
+ case F2FS_IOC_SEC_TRIM_FILE:
+ return f2fs_sec_trim_file(filp, arg);
default:
return -ENOTTY;
}
@@ -4004,6 +4146,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC_GET_COMPRESS_BLOCKS:
case F2FS_IOC_RELEASE_COMPRESS_BLOCKS:
case F2FS_IOC_RESERVE_COMPRESS_BLOCKS:
+ case F2FS_IOC_SEC_TRIM_FILE:
break;
default:
return -ENOIOCTLCMD;
--
2.27.0.278.ge193c7cf3a9-goog
On 2020/6/11 11:16, Daeho Jeong wrote:
> From: Daeho Jeong <[email protected]>
>
> Added a new ioctl to send discard commands or/and zero out
> to whole data area of a regular file for security reason.
>
> Signed-off-by: Daeho Jeong <[email protected]>
> ---
> fs/f2fs/f2fs.h | 8 +++
> fs/f2fs/file.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 151 insertions(+)
>
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index c812fb8e2d9c..ca139fac5a73 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -434,6 +434,7 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
> _IOR(F2FS_IOCTL_MAGIC, 18, __u64)
> #define F2FS_IOC_RESERVE_COMPRESS_BLOCKS \
> _IOR(F2FS_IOCTL_MAGIC, 19, __u64)
> +#define F2FS_IOC_SEC_TRIM_FILE _IOW(F2FS_IOCTL_MAGIC, 20, __u32)
>
> #define F2FS_IOC_GET_VOLUME_NAME FS_IOC_GETFSLABEL
> #define F2FS_IOC_SET_VOLUME_NAME FS_IOC_SETFSLABEL
> @@ -453,6 +454,13 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
> #define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */
> #define F2FS_GOING_DOWN_NEED_FSCK 0x4 /* going down to trigger fsck */
>
> +/*
> + * Flags used by F2FS_IOC_SEC_TRIM_FILE
> + */
> +#define F2FS_TRIM_FILE_DISCARD 0x1 /* send discard command */
> +#define F2FS_TRIM_FILE_ZEROOUT 0x2 /* zero out */
> +#define F2FS_TRIM_FILE_MASK 0x3
> +
> #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
> /*
> * ioctl commands in 32 bit emulation
> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> index dfa1ac2d751a..ba9b7ec5d6bf 100644
> --- a/fs/f2fs/file.c
> +++ b/fs/f2fs/file.c
> @@ -3749,6 +3749,146 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
> return ret;
> }
>
> +static int f2fs_secure_erase(struct block_device *bdev, block_t block,
> + block_t len, u32 flags)
> +{
> + struct request_queue *q = bdev_get_queue(bdev);
> + sector_t sector = SECTOR_FROM_BLOCK(block);
> + sector_t nr_sects = SECTOR_FROM_BLOCK(len);
> + int ret = 0;
> +
> + if (!q)
> + return -ENXIO;
> +
> + if (flags & F2FS_TRIM_FILE_DISCARD)
> + ret = blkdev_issue_discard(bdev, sector, nr_sects, GFP_NOFS,
> + blk_queue_secure_erase(q) ?
> + BLKDEV_DISCARD_SECURE : 0);
> +
> + if (!ret && (flags & F2FS_TRIM_FILE_ZEROOUT))
> + ret = blkdev_issue_zeroout(bdev, sector, nr_sects, GFP_NOFS, 0);
> +
> + return ret;
> +}
> +
> +static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
> +{
> + struct inode *inode = file_inode(filp);
> + struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
> + struct address_space *mapping = inode->i_mapping;
> + struct block_device *prev_bdev = NULL;
> + pgoff_t index, pg_start = 0, pg_end;
> + block_t prev_block = 0, len = 0;
> + u32 flags;
> + int ret = 0;
> +
> + if (!(filp->f_mode & FMODE_WRITE))
> + return -EBADF;
> +
> + if (get_user(flags, (u32 __user *)arg))
> + return -EFAULT;
> + if (flags == 0 || (flags & ~F2FS_TRIM_FILE_MASK))
> + return -EINVAL;
> +
> + if ((flags & F2FS_TRIM_FILE_DISCARD) && !f2fs_hw_support_discard(sbi))
> + return -EOPNOTSUPP;
> +
> + file_start_write(filp);
Now, I'm a little confused about when we need to call __mnt_want_write_file(),
you know, vfs_write() still will call this function when updating time.
- __generic_file_write_iter
- file_update_time
- __mnt_want_write_file
And previously, f2fs ioctl uses mnt_{want,drop}_write_file() whenever there is
any updates on fs/file, if Eric is correct, we need to clean up most of ioctl
interface as well.
> + inode_lock(inode);
> +
> + if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode) ||
> + f2fs_compressed_file(inode)) {
> + ret = -EINVAL;
> + goto err;
> + }
> +
> + if (!inode->i_size)
> + goto err;
> + pg_end = DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
> +
> + ret = f2fs_convert_inline_inode(inode);
> + if (ret)
> + goto err;
> +
> + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
> + down_write(&F2FS_I(inode)->i_mmap_sem);
> +
> + ret = filemap_write_and_wait(mapping);
> + if (ret)
> + goto out;
> +
> + truncate_inode_pages(mapping, 0);
> +
> + for (index = pg_start; index < pg_end;) {
> + struct dnode_of_data dn;
> + unsigned int end_offset;
> +
> + set_new_dnode(&dn, inode, NULL, NULL, 0);
> + ret = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
> + if (ret)
if (ret == -ENOENT) {
index = f2fs_get_next_page_offset(&dn, index);
continue;
}
> + goto out;
> +
> + end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
> + if (pg_end < end_offset + index)
> + end_offset = pg_end - index;
> +
> + for (; dn.ofs_in_node < end_offset;
> + dn.ofs_in_node++, index++) {
> + struct block_device *cur_bdev;
> + block_t blkaddr = f2fs_data_blkaddr(&dn);
> +
> + if (__is_valid_data_blkaddr(blkaddr)) {
> + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode),
> + blkaddr, DATA_GENERIC_ENHANCE)) {
> + ret = -EFSCORRUPTED;
> + goto out;
if we goto out label here, we will miss to call f2fs_put_dnode()?
> + }
> + } else
How about this?
if (!__is_valid_data_blkaddr())
continue;
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), blkaddr, DATA_GENERIC_ENHANCE)) {
ret = -EFSCORRUPTED;
goto out;
}
> + continue;
> +
> + cur_bdev = f2fs_target_device(sbi, blkaddr, NULL);
> + if (f2fs_is_multi_device(sbi)) {
> + int i = f2fs_target_device_index(sbi, blkaddr);
> +
> + blkaddr -= FDEV(i).start_blk;
> + }
> +
> + if (len) {
> + if (prev_bdev == cur_bdev &&
> + blkaddr == prev_block + len) {
> + len++;
> + } else {
> + ret = f2fs_secure_erase(prev_bdev,
> + prev_block, len, flags);
> + if (ret)
> + goto out;
Ditto,
Thanks,
> +
> + len = 0;
> + }
> + }
> +
> + if (!len) {
> + prev_bdev = cur_bdev;
> + prev_block = blkaddr;
> + len = 1;
> + }
> + }
> +
> + f2fs_put_dnode(&dn);
> + }
> +
> + if (len)
> + ret = f2fs_secure_erase(prev_bdev, prev_block, len, flags);
> +out:
> + up_write(&F2FS_I(inode)->i_mmap_sem);
> + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
> +err:
> + inode_unlock(inode);
> + file_end_write(filp);
> +
> + return ret;
> +}
> +
> long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> {
> if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp)))))
> @@ -3835,6 +3975,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> return f2fs_release_compress_blocks(filp, arg);
> case F2FS_IOC_RESERVE_COMPRESS_BLOCKS:
> return f2fs_reserve_compress_blocks(filp, arg);
> + case F2FS_IOC_SEC_TRIM_FILE:
> + return f2fs_sec_trim_file(filp, arg);
> default:
> return -ENOTTY;
> }
> @@ -4004,6 +4146,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> case F2FS_IOC_GET_COMPRESS_BLOCKS:
> case F2FS_IOC_RELEASE_COMPRESS_BLOCKS:
> case F2FS_IOC_RESERVE_COMPRESS_BLOCKS:
> + case F2FS_IOC_SEC_TRIM_FILE:
> break;
> default:
> return -ENOIOCTLCMD;
>
2020년 6월 11일 (목) 오후 5:56, Chao Yu <[email protected]>님이 작성:
>
> On 2020/6/11 11:16, Daeho Jeong wrote:
> > From: Daeho Jeong <[email protected]>
> >
> > Added a new ioctl to send discard commands or/and zero out
> > to whole data area of a regular file for security reason.
> >
> > Signed-off-by: Daeho Jeong <[email protected]>
> > ---
> > fs/f2fs/f2fs.h | 8 +++
> > fs/f2fs/file.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++++
> > 2 files changed, 151 insertions(+)
> >
> > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> > index c812fb8e2d9c..ca139fac5a73 100644
> > --- a/fs/f2fs/f2fs.h
> > +++ b/fs/f2fs/f2fs.h
> > @@ -434,6 +434,7 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
> > _IOR(F2FS_IOCTL_MAGIC, 18, __u64)
> > #define F2FS_IOC_RESERVE_COMPRESS_BLOCKS \
> > _IOR(F2FS_IOCTL_MAGIC, 19, __u64)
> > +#define F2FS_IOC_SEC_TRIM_FILE _IOW(F2FS_IOCTL_MAGIC, 20, __u32)
> >
> > #define F2FS_IOC_GET_VOLUME_NAME FS_IOC_GETFSLABEL
> > #define F2FS_IOC_SET_VOLUME_NAME FS_IOC_SETFSLABEL
> > @@ -453,6 +454,13 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
> > #define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */
> > #define F2FS_GOING_DOWN_NEED_FSCK 0x4 /* going down to trigger fsck */
> >
> > +/*
> > + * Flags used by F2FS_IOC_SEC_TRIM_FILE
> > + */
> > +#define F2FS_TRIM_FILE_DISCARD 0x1 /* send discard command */
> > +#define F2FS_TRIM_FILE_ZEROOUT 0x2 /* zero out */
> > +#define F2FS_TRIM_FILE_MASK 0x3
> > +
> > #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
> > /*
> > * ioctl commands in 32 bit emulation
> > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > index dfa1ac2d751a..ba9b7ec5d6bf 100644
> > --- a/fs/f2fs/file.c
> > +++ b/fs/f2fs/file.c
> > @@ -3749,6 +3749,146 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
> > return ret;
> > }
> >
> > +static int f2fs_secure_erase(struct block_device *bdev, block_t block,
> > + block_t len, u32 flags)
> > +{
> > + struct request_queue *q = bdev_get_queue(bdev);
> > + sector_t sector = SECTOR_FROM_BLOCK(block);
> > + sector_t nr_sects = SECTOR_FROM_BLOCK(len);
> > + int ret = 0;
> > +
> > + if (!q)
> > + return -ENXIO;
> > +
> > + if (flags & F2FS_TRIM_FILE_DISCARD)
> > + ret = blkdev_issue_discard(bdev, sector, nr_sects, GFP_NOFS,
> > + blk_queue_secure_erase(q) ?
> > + BLKDEV_DISCARD_SECURE : 0);
> > +
> > + if (!ret && (flags & F2FS_TRIM_FILE_ZEROOUT))
> > + ret = blkdev_issue_zeroout(bdev, sector, nr_sects, GFP_NOFS, 0);
> > +
> > + return ret;
> > +}
> > +
> > +static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
> > +{
> > + struct inode *inode = file_inode(filp);
> > + struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
> > + struct address_space *mapping = inode->i_mapping;
> > + struct block_device *prev_bdev = NULL;
> > + pgoff_t index, pg_start = 0, pg_end;
> > + block_t prev_block = 0, len = 0;
> > + u32 flags;
> > + int ret = 0;
> > +
> > + if (!(filp->f_mode & FMODE_WRITE))
> > + return -EBADF;
> > +
> > + if (get_user(flags, (u32 __user *)arg))
> > + return -EFAULT;
> > + if (flags == 0 || (flags & ~F2FS_TRIM_FILE_MASK))
> > + return -EINVAL;
> > +
> > + if ((flags & F2FS_TRIM_FILE_DISCARD) && !f2fs_hw_support_discard(sbi))
> > + return -EOPNOTSUPP;
> > +
> > + file_start_write(filp);
>
> Now, I'm a little confused about when we need to call __mnt_want_write_file(),
> you know, vfs_write() still will call this function when updating time.
> - __generic_file_write_iter
> - file_update_time
> - __mnt_want_write_file
>
> And previously, f2fs ioctl uses mnt_{want,drop}_write_file() whenever there is
> any updates on fs/file, if Eric is correct, we need to clean up most of ioctl
> interface as well.
I also saw most filesytem codes use just mnt_{want,drop}_write_file()
and actually it doesn't affect code working. It's a matter of doing a
redundant job or not.
AFAIUI, if the file is not open for writing (FMODE_WRITE), we have to
call mnt_want_write_file() to increase mnt_writers.
In this case, we already checked it has FMODE_WRITE flag.
>
> > + inode_lock(inode);
> > +
> > + if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode) ||
> > + f2fs_compressed_file(inode)) {
> > + ret = -EINVAL;
> > + goto err;
> > + }
> > +
> > + if (!inode->i_size)
> > + goto err;
> > + pg_end = DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
> > +
> > + ret = f2fs_convert_inline_inode(inode);
> > + if (ret)
> > + goto err;
> > +
> > + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
> > + down_write(&F2FS_I(inode)->i_mmap_sem);
> > +
> > + ret = filemap_write_and_wait(mapping);
> > + if (ret)
> > + goto out;
> > +
> > + truncate_inode_pages(mapping, 0);
> > +
> > + for (index = pg_start; index < pg_end;) {
> > + struct dnode_of_data dn;
> > + unsigned int end_offset;
> > +
> > + set_new_dnode(&dn, inode, NULL, NULL, 0);
> > + ret = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
> > + if (ret)
>
> if (ret == -ENOENT) {
> index = f2fs_get_next_page_offset(&dn, index);
> continue;
> }
Got it.
>
> > + goto out;
> > +
> > + end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
> > + if (pg_end < end_offset + index)
> > + end_offset = pg_end - index;
> > +
> > + for (; dn.ofs_in_node < end_offset;
> > + dn.ofs_in_node++, index++) {
> > + struct block_device *cur_bdev;
> > + block_t blkaddr = f2fs_data_blkaddr(&dn);
> > +
> > + if (__is_valid_data_blkaddr(blkaddr)) {
> > + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode),
> > + blkaddr, DATA_GENERIC_ENHANCE)) {
> > + ret = -EFSCORRUPTED;
> > + goto out;
>
>
> if we goto out label here, we will miss to call f2fs_put_dnode()?
>
Oops, I missed this, when I changed the code flow. Thanks!
> > + }
> > + } else
>
> How about this?
>
> if (!__is_valid_data_blkaddr())
> continue;
>
> if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), blkaddr, DATA_GENERIC_ENHANCE)) {
> ret = -EFSCORRUPTED;
> goto out;
> }
>
Looks better.
> > + continue;
> > +
> > + cur_bdev = f2fs_target_device(sbi, blkaddr, NULL);
> > + if (f2fs_is_multi_device(sbi)) {
> > + int i = f2fs_target_device_index(sbi, blkaddr);
> > +
> > + blkaddr -= FDEV(i).start_blk;
> > + }
> > +
> > + if (len) {
> > + if (prev_bdev == cur_bdev &&
> > + blkaddr == prev_block + len) {
> > + len++;
> > + } else {
> > + ret = f2fs_secure_erase(prev_bdev,
> > + prev_block, len, flags);
> > + if (ret)
> > + goto out;
>
> Ditto,
Got it.
On Thu, Jun 11, 2020 at 08:04:06PM +0900, Daeho Jeong wrote:
> > > +static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
> > > +{
> > > + struct inode *inode = file_inode(filp);
> > > + struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
> > > + struct address_space *mapping = inode->i_mapping;
> > > + struct block_device *prev_bdev = NULL;
> > > + pgoff_t index, pg_start = 0, pg_end;
> > > + block_t prev_block = 0, len = 0;
> > > + u32 flags;
> > > + int ret = 0;
> > > +
> > > + if (!(filp->f_mode & FMODE_WRITE))
> > > + return -EBADF;
> > > +
> > > + if (get_user(flags, (u32 __user *)arg))
> > > + return -EFAULT;
> > > + if (flags == 0 || (flags & ~F2FS_TRIM_FILE_MASK))
> > > + return -EINVAL;
> > > +
> > > + if ((flags & F2FS_TRIM_FILE_DISCARD) && !f2fs_hw_support_discard(sbi))
> > > + return -EOPNOTSUPP;
> > > +
> > > + file_start_write(filp);
> >
> > Now, I'm a little confused about when we need to call __mnt_want_write_file(),
> > you know, vfs_write() still will call this function when updating time.
> > - __generic_file_write_iter
> > - file_update_time
> > - __mnt_want_write_file
> >
> > And previously, f2fs ioctl uses mnt_{want,drop}_write_file() whenever there is
> > any updates on fs/file, if Eric is correct, we need to clean up most of ioctl
> > interface as well.
>
> I also saw most filesytem codes use just mnt_{want,drop}_write_file()
> and actually it doesn't affect code working. It's a matter of doing a
> redundant job or not.
> AFAIUI, if the file is not open for writing (FMODE_WRITE), we have to
> call mnt_want_write_file() to increase mnt_writers.
> In this case, we already checked it has FMODE_WRITE flag.
If the fd isn't writable (or may not be writable), mnt_want_write_file() is
needed. That includes all ioctls that operate (or may operate) on directories,
since directories can't be opened for writing.
But when the fd is guaranteed to be writable, incrementing mnt_writers is
pointless. I'm trying to clean this up in the VFS:
https://lkml.kernel.org/r/[email protected].
mnt_want_write_file() still does the freeze protection, which file_start_write()
achieves more directly.
The only other thing that mnt_want_write_file() does is the check for emergency
remount r/o, which I doubt is very important. It's racy, so the filesystem
needs to detect it in other places too.
I'm not sure why file_update_time() uses __mnt_want_write_file(). Either it
assumes the fd might not be writable, or it just wants the check for emergency
remount r/o, or it's just a mistake. Note also that mtime isn't always updated,
so just because file_update_time() calls __mnt_want_write_file() doesn't mean
that write() always calls __mnt_want_write_file().
- Eric
On Thu, Jun 11, 2020 at 12:16:52PM +0900, Daeho Jeong wrote:
> + for (index = pg_start; index < pg_end;) {
> + struct dnode_of_data dn;
> + unsigned int end_offset;
> +
> + set_new_dnode(&dn, inode, NULL, NULL, 0);
> + ret = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
> + if (ret)
> + goto out;
> +
> + end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
> + if (pg_end < end_offset + index)
> + end_offset = pg_end - index;
> +
> + for (; dn.ofs_in_node < end_offset;
> + dn.ofs_in_node++, index++) {
> + struct block_device *cur_bdev;
> + block_t blkaddr = f2fs_data_blkaddr(&dn);
> +
> + if (__is_valid_data_blkaddr(blkaddr)) {
> + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode),
> + blkaddr, DATA_GENERIC_ENHANCE)) {
> + ret = -EFSCORRUPTED;
> + goto out;
> + }
> + } else
> + continue;
> +
> + cur_bdev = f2fs_target_device(sbi, blkaddr, NULL);
> + if (f2fs_is_multi_device(sbi)) {
> + int i = f2fs_target_device_index(sbi, blkaddr);
> +
> + blkaddr -= FDEV(i).start_blk;
> + }
> +
> + if (len) {
> + if (prev_bdev == cur_bdev &&
> + blkaddr == prev_block + len) {
> + len++;
> + } else {
> + ret = f2fs_secure_erase(prev_bdev,
> + prev_block, len, flags);
> + if (ret)
> + goto out;
> +
> + len = 0;
> + }
> + }
> +
> + if (!len) {
> + prev_bdev = cur_bdev;
> + prev_block = blkaddr;
> + len = 1;
> + }
> + }
> +
> + f2fs_put_dnode(&dn);
> + }
This loop processes the entire file, which may be very large. So it could take
a very long time to execute.
It should at least use the following to make the task killable and preemptible:
if (fatal_signal_pending(current)) {
err = -EINTR;
goto out;
}
cond_resched();
Also, perhaps this ioctl should be made incremental, i.e. take in an
(offset, length) like pwrite()?
- Eric
2020년 6월 12일 (금) 오전 1:27, Eric Biggers <[email protected]>님이 작성:
>
> On Thu, Jun 11, 2020 at 12:16:52PM +0900, Daeho Jeong wrote:
> > + for (index = pg_start; index < pg_end;) {
> > + struct dnode_of_data dn;
> > + unsigned int end_offset;
> > +
> > + set_new_dnode(&dn, inode, NULL, NULL, 0);
> > + ret = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
> > + if (ret)
> > + goto out;
> > +
> > + end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
> > + if (pg_end < end_offset + index)
> > + end_offset = pg_end - index;
> > +
> > + for (; dn.ofs_in_node < end_offset;
> > + dn.ofs_in_node++, index++) {
> > + struct block_device *cur_bdev;
> > + block_t blkaddr = f2fs_data_blkaddr(&dn);
> > +
> > + if (__is_valid_data_blkaddr(blkaddr)) {
> > + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode),
> > + blkaddr, DATA_GENERIC_ENHANCE)) {
> > + ret = -EFSCORRUPTED;
> > + goto out;
> > + }
> > + } else
> > + continue;
> > +
> > + cur_bdev = f2fs_target_device(sbi, blkaddr, NULL);
> > + if (f2fs_is_multi_device(sbi)) {
> > + int i = f2fs_target_device_index(sbi, blkaddr);
> > +
> > + blkaddr -= FDEV(i).start_blk;
> > + }
> > +
> > + if (len) {
> > + if (prev_bdev == cur_bdev &&
> > + blkaddr == prev_block + len) {
> > + len++;
> > + } else {
> > + ret = f2fs_secure_erase(prev_bdev,
> > + prev_block, len, flags);
> > + if (ret)
> > + goto out;
> > +
> > + len = 0;
> > + }
> > + }
> > +
> > + if (!len) {
> > + prev_bdev = cur_bdev;
> > + prev_block = blkaddr;
> > + len = 1;
> > + }
> > + }
> > +
> > + f2fs_put_dnode(&dn);
> > + }
>
> This loop processes the entire file, which may be very large. So it could take
> a very long time to execute.
>
> It should at least use the following to make the task killable and preemptible:
>
> if (fatal_signal_pending(current)) {
> err = -EINTR;
> goto out;
> }
> cond_resched();
>
Good point!
> Also, perhaps this ioctl should be made incremental, i.e. take in an
> (offset, length) like pwrite()?
>
> - Eric
Discard and Zeroing will be treated in a unit of block, which is 4KB
in F2FS case.
Do you really need the (offset, length) option here even if the unit
is a 4KB block? I guess this option might make the user even
inconvenienced to use this ioctl, because they have to bear 4KB
alignment in mind.
On Fri, Jun 12, 2020 at 07:49:12AM +0900, Daeho Jeong wrote:
> 2020년 6월 12일 (금) 오전 1:27, Eric Biggers <[email protected]>님이 작성:
> >
> > On Thu, Jun 11, 2020 at 12:16:52PM +0900, Daeho Jeong wrote:
> > > + for (index = pg_start; index < pg_end;) {
> > > + struct dnode_of_data dn;
> > > + unsigned int end_offset;
> > > +
> > > + set_new_dnode(&dn, inode, NULL, NULL, 0);
> > > + ret = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
> > > + if (ret)
> > > + goto out;
> > > +
> > > + end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
> > > + if (pg_end < end_offset + index)
> > > + end_offset = pg_end - index;
> > > +
> > > + for (; dn.ofs_in_node < end_offset;
> > > + dn.ofs_in_node++, index++) {
> > > + struct block_device *cur_bdev;
> > > + block_t blkaddr = f2fs_data_blkaddr(&dn);
> > > +
> > > + if (__is_valid_data_blkaddr(blkaddr)) {
> > > + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode),
> > > + blkaddr, DATA_GENERIC_ENHANCE)) {
> > > + ret = -EFSCORRUPTED;
> > > + goto out;
> > > + }
> > > + } else
> > > + continue;
> > > +
> > > + cur_bdev = f2fs_target_device(sbi, blkaddr, NULL);
> > > + if (f2fs_is_multi_device(sbi)) {
> > > + int i = f2fs_target_device_index(sbi, blkaddr);
> > > +
> > > + blkaddr -= FDEV(i).start_blk;
> > > + }
> > > +
> > > + if (len) {
> > > + if (prev_bdev == cur_bdev &&
> > > + blkaddr == prev_block + len) {
> > > + len++;
> > > + } else {
> > > + ret = f2fs_secure_erase(prev_bdev,
> > > + prev_block, len, flags);
> > > + if (ret)
> > > + goto out;
> > > +
> > > + len = 0;
> > > + }
> > > + }
> > > +
> > > + if (!len) {
> > > + prev_bdev = cur_bdev;
> > > + prev_block = blkaddr;
> > > + len = 1;
> > > + }
> > > + }
> > > +
> > > + f2fs_put_dnode(&dn);
> > > + }
> >
> > This loop processes the entire file, which may be very large. So it could take
> > a very long time to execute.
> >
> > It should at least use the following to make the task killable and preemptible:
> >
> > if (fatal_signal_pending(current)) {
> > err = -EINTR;
> > goto out;
> > }
> > cond_resched();
> >
>
> Good point!
>
> > Also, perhaps this ioctl should be made incremental, i.e. take in an
> > (offset, length) like pwrite()?
> >
> > - Eric
>
> Discard and Zeroing will be treated in a unit of block, which is 4KB
> in F2FS case.
> Do you really need the (offset, length) option here even if the unit
> is a 4KB block? I guess this option might make the user even
> inconvenienced to use this ioctl, because they have to bear 4KB
> alignment in mind.
The ioctl as currently proposed always erases the entire file, which could be
gigabytes. That could take a very long time.
I'm suggesting considering making it possible to call the ioctl multiple times
to process the file incrementally, like you would do with write() or pwrite().
That implies that for each ioctl call, the length would need to be specified
unless it's hardcoded to 4KiB which would be very inefficient. Users would
probably want to process a larger amount at a time, like 1 MiB, right?
Likewise the offset would need to be specified as well, unless it were to be
taken implicitly from f_pos.
- Eric
For the incremental way of erasing, we might as well support the
(offset, length) option in a unit of 4KiB.
So, you might use this ioctl like the below. Does it work for you?
struct f2fs_sec_trim {
u64 startblk;
u64 blklen;
u32 flags;
};
sectrim.startblk = 0;
sectrim.blklen = 256; // 1MiB
sectrim.flags = F2FS_TRIM_FILE_DISCARD | F2FS_TRIM_FILE_ZEROOUT;
ret = ioctl(fd, F2FS_SEC_TRIM_FILE, §rim);
2020년 6월 12일 (금) 오전 8:00, Eric Biggers <[email protected]>님이 작성:
>
> On Fri, Jun 12, 2020 at 07:49:12AM +0900, Daeho Jeong wrote:
> > 2020년 6월 12일 (금) 오전 1:27, Eric Biggers <[email protected]>님이 작성:
> > >
> > > On Thu, Jun 11, 2020 at 12:16:52PM +0900, Daeho Jeong wrote:
> > > > + for (index = pg_start; index < pg_end;) {
> > > > + struct dnode_of_data dn;
> > > > + unsigned int end_offset;
> > > > +
> > > > + set_new_dnode(&dn, inode, NULL, NULL, 0);
> > > > + ret = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
> > > > + if (ret)
> > > > + goto out;
> > > > +
> > > > + end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
> > > > + if (pg_end < end_offset + index)
> > > > + end_offset = pg_end - index;
> > > > +
> > > > + for (; dn.ofs_in_node < end_offset;
> > > > + dn.ofs_in_node++, index++) {
> > > > + struct block_device *cur_bdev;
> > > > + block_t blkaddr = f2fs_data_blkaddr(&dn);
> > > > +
> > > > + if (__is_valid_data_blkaddr(blkaddr)) {
> > > > + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode),
> > > > + blkaddr, DATA_GENERIC_ENHANCE)) {
> > > > + ret = -EFSCORRUPTED;
> > > > + goto out;
> > > > + }
> > > > + } else
> > > > + continue;
> > > > +
> > > > + cur_bdev = f2fs_target_device(sbi, blkaddr, NULL);
> > > > + if (f2fs_is_multi_device(sbi)) {
> > > > + int i = f2fs_target_device_index(sbi, blkaddr);
> > > > +
> > > > + blkaddr -= FDEV(i).start_blk;
> > > > + }
> > > > +
> > > > + if (len) {
> > > > + if (prev_bdev == cur_bdev &&
> > > > + blkaddr == prev_block + len) {
> > > > + len++;
> > > > + } else {
> > > > + ret = f2fs_secure_erase(prev_bdev,
> > > > + prev_block, len, flags);
> > > > + if (ret)
> > > > + goto out;
> > > > +
> > > > + len = 0;
> > > > + }
> > > > + }
> > > > +
> > > > + if (!len) {
> > > > + prev_bdev = cur_bdev;
> > > > + prev_block = blkaddr;
> > > > + len = 1;
> > > > + }
> > > > + }
> > > > +
> > > > + f2fs_put_dnode(&dn);
> > > > + }
> > >
> > > This loop processes the entire file, which may be very large. So it could take
> > > a very long time to execute.
> > >
> > > It should at least use the following to make the task killable and preemptible:
> > >
> > > if (fatal_signal_pending(current)) {
> > > err = -EINTR;
> > > goto out;
> > > }
> > > cond_resched();
> > >
> >
> > Good point!
> >
> > > Also, perhaps this ioctl should be made incremental, i.e. take in an
> > > (offset, length) like pwrite()?
> > >
> > > - Eric
> >
> > Discard and Zeroing will be treated in a unit of block, which is 4KB
> > in F2FS case.
> > Do you really need the (offset, length) option here even if the unit
> > is a 4KB block? I guess this option might make the user even
> > inconvenienced to use this ioctl, because they have to bear 4KB
> > alignment in mind.
>
> The ioctl as currently proposed always erases the entire file, which could be
> gigabytes. That could take a very long time.
>
> I'm suggesting considering making it possible to call the ioctl multiple times
> to process the file incrementally, like you would do with write() or pwrite().
>
> That implies that for each ioctl call, the length would need to be specified
> unless it's hardcoded to 4KiB which would be very inefficient. Users would
> probably want to process a larger amount at a time, like 1 MiB, right?
>
> Likewise the offset would need to be specified as well, unless it were to be
> taken implicitly from f_pos.
>
> - Eric
On Fri, Jun 12, 2020 at 09:00:58AM +0900, Daeho Jeong wrote:
> For the incremental way of erasing, we might as well support the
> (offset, length) option in a unit of 4KiB.
>
> So, you might use this ioctl like the below. Does it work for you?
> struct f2fs_sec_trim {
> u64 startblk;
> u64 blklen;
> u32 flags;
> };
>
> sectrim.startblk = 0;
> sectrim.blklen = 256; // 1MiB
> sectrim.flags = F2FS_TRIM_FILE_DISCARD | F2FS_TRIM_FILE_ZEROOUT;
> ret = ioctl(fd, F2FS_SEC_TRIM_FILE, §rim);
Most filesystem ioctls and syscalls take offsets and lengths in bytes,
especially newer ones. This way implementations of these APIs can support other
alignments later. The implementation can still require alignment for now, i.e.
return -EINVAL if !IS_ALIGNED(arg.pos | arg.len, sb->s_blocksize).
Also, flags should be a u64, especially since it wouldn't actually increase the
size of the struct (due to padding).
- Eric