Zero out filename field when file is deleted. Also, add mount option
nowipe_filename to disable this filename wipeout if desired.
Signed-off-by: Leah Rumancik <[email protected]>
---
fs/ext4/ext4.h | 1 +
fs/ext4/namei.c | 4 ++++
fs/ext4/super.c | 11 ++++++++++-
3 files changed, 15 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 826a56e3bbd2..8011418176bc 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1247,6 +1247,7 @@ struct ext4_inode_info {
#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */
#define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */
#define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */
+#define EXT4_MOUNT2_WIPE_FILENAME 0x00000080 /* Wipe filename on del entry */
#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 883e2a7cd4ab..ae6ecabd4d97 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2492,6 +2492,10 @@ int ext4_generic_delete_entry(struct inode *dir,
else
de->inode = 0;
inode_inc_iversion(dir);
+
+ if (test_opt2(dir->i_sb, WIPE_FILENAME))
+ memset(de_del->name, 0, de_del->name_len);
+
return 0;
}
i += ext4_rec_len_from_disk(de->rec_len, blocksize);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b9693680463a..5e8737b3f171 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1688,7 +1688,7 @@ enum {
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
- Opt_prefetch_block_bitmaps,
+ Opt_prefetch_block_bitmaps, Opt_nowipe_filename,
#ifdef CONFIG_EXT4_DEBUG
Opt_fc_debug_max_replay, Opt_fc_debug_force
#endif
@@ -1787,6 +1787,7 @@ static const match_table_t tokens = {
{Opt_test_dummy_encryption, "test_dummy_encryption"},
{Opt_inlinecrypt, "inlinecrypt"},
{Opt_nombcache, "nombcache"},
+ {Opt_nowipe_filename, "nowipe_filename"},
{Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
{Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"},
{Opt_removed, "check=none"}, /* mount option from ext2/3 */
@@ -2007,6 +2008,8 @@ static const struct mount_opts {
{Opt_max_dir_size_kb, 0, MOPT_GTE0},
{Opt_test_dummy_encryption, 0, MOPT_STRING},
{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
+ {Opt_nowipe_filename, EXT4_MOUNT2_WIPE_FILENAME, MOPT_CLEAR | MOPT_2 |
+ MOPT_EXT4_ONLY},
{Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS,
MOPT_SET},
#ifdef CONFIG_EXT4_DEBUG
@@ -2621,6 +2624,10 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
} else if (test_opt2(sb, DAX_INODE)) {
SEQ_OPTS_PUTS("dax=inode");
}
+
+ if (!test_opt2(sb, WIPE_FILENAME))
+ SEQ_OPTS_PUTS("nowipe_filename");
+
ext4_show_quota_options(seq, sb);
return 0;
}
@@ -4161,6 +4168,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (def_mount_opts & EXT4_DEFM_DISCARD)
set_opt(sb, DISCARD);
+ set_opt2(sb, WIPE_FILENAME);
+
sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
--
2.31.0.291.g576ba9dcdaf-goog
ioctl EXT4_FLUSH_JOURNAL flushes the journal log with the option to
discard journal log blocks. With the filename wipeout patch, if the discard
mount option is set, Ext4 guarantees that all data will be discarded on
deletion. This ioctl allows for periodically discarding journal contents
too.
Also, add journal discard (if discard supported) during journal load
after recovery. This provides a potential solution to
https://lore.kernel.org/linux-ext4/[email protected]/ for
disks that support discard. After a successful journal recovery, e2fsck can
call this ioctl to discard the journal as well.
Signed-off-by: Leah Rumancik <[email protected]>
---
fs/ext4/ext4.h | 1 +
fs/ext4/ioctl.c | 28 +++++++++++
fs/jbd2/journal.c | 116 +++++++++++++++++++++++++++++++++++++++++--
include/linux/jbd2.h | 1 +
4 files changed, 143 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8011418176bc..92c039ebcba7 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -724,6 +724,7 @@ enum {
#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40)
#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32)
#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap)
+#define EXT4_FLUSH_JOURNAL _IOW('f', 43, int)
#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a2cf35066f46..1d3636c1de3b 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1318,6 +1318,33 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return -EOPNOTSUPP;
return fsverity_ioctl_read_metadata(filp,
(const void __user *)arg);
+ case EXT4_FLUSH_JOURNAL:
+ {
+ int discard = 0, err = 0;
+
+ /* file argument is not the mount point */
+ if (file_dentry(filp) != sb->s_root)
+ return -EINVAL;
+
+ /* filesystem is not backed by block device */
+ if (sb->s_bdev == NULL)
+ return -EINVAL;
+
+ if (copy_from_user(&discard, (int __user *)arg, sizeof(int)))
+ return -EFAULT;
+
+ if (EXT4_SB(sb)->s_journal) {
+ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+
+ if (discard)
+ err = jbd2_journal_flush_and_discard(EXT4_SB(sb)->s_journal);
+ else
+ err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ }
+ return err;
+ }
default:
return -ENOTTY;
@@ -1407,6 +1434,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC_GET_ES_CACHE:
case FS_IOC_FSGETXATTR:
case FS_IOC_FSSETXATTR:
+ case EXT4_FLUSH_JOURNAL:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 2dc944442802..9718512e7178 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -67,6 +67,7 @@ EXPORT_SYMBOL(jbd2_journal_set_triggers);
EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
EXPORT_SYMBOL(jbd2_journal_forget);
EXPORT_SYMBOL(jbd2_journal_flush);
+EXPORT_SYMBOL(jbd2_journal_flush_and_discard);
EXPORT_SYMBOL(jbd2_journal_revoke);
EXPORT_SYMBOL(jbd2_journal_init_dev);
@@ -1686,6 +1687,90 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
write_unlock(&journal->j_state_lock);
}
+/* discard journal blocks excluding journal superblock */
+static int __jbd2_journal_issue_discard(journal_t *journal)
+{
+ int err = 0;
+ unsigned long block, log_offset;
+ unsigned long long phys_block, block_start, block_stop;
+ loff_t byte_start, byte_stop, byte_count;
+ struct request_queue *q = bdev_get_queue(journal->j_dev);
+
+ if (!q)
+ return -ENXIO;
+
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ /* lookup block mapping and issue discard for each contiguous region */
+ log_offset = be32_to_cpu(journal->j_superblock->s_first);
+
+ err = jbd2_journal_bmap(journal, log_offset, &block_start);
+ if (err) {
+ printk(KERN_ERR "JBD2: bad block at offset %lu", log_offset);
+ return err;
+ }
+
+ /*
+ * use block_start - 1 to meet check for contiguous with previous region:
+ * phys_block == block_stop + 1
+ */
+ block_stop = block_start - 1;
+
+ for (block = log_offset; block < journal->j_total_len; block++) {
+ err = jbd2_journal_bmap(journal, block, &phys_block);
+ if (err) {
+ printk(KERN_ERR "JBD2: bad block at offset %lu", block);
+ return err;
+ }
+
+ /*
+ * if block is last block, update stopping point
+ * if not last block and
+ * block is contiguous with previous block, continue
+ */
+ if (block == journal->j_total_len - 1)
+ block_stop = phys_block;
+ else if (phys_block == block_stop + 1) {
+ block_stop++;
+ continue;
+ }
+
+ /*
+ * if not contiguous with prior physical block or this is last
+ * block of journal, take care of the region
+ */
+ byte_start = block_start * journal->j_blocksize;
+ byte_stop = block_stop * journal->j_blocksize;
+ byte_count = (block_stop - block_start + 1) *
+ journal->j_blocksize;
+
+ truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping,
+ byte_start, byte_stop);
+
+ /*
+ * use blkdev_issue_discard instead of sb_issue_discard
+ * because superblock not yet populated when this is
+ * called during journal_load during mount process
+ */
+ err = blkdev_issue_discard(journal->j_dev,
+ byte_start >> SECTOR_SHIFT,
+ byte_count >> SECTOR_SHIFT,
+ GFP_NOFS, 0);
+
+ if (unlikely(err != 0)) {
+ printk(KERN_ERR "JBD2: unable to discard "
+ "journal at physical blocks %llu - %llu",
+ block_start, block_stop);
+ return err;
+ }
+
+ block_start = phys_block;
+ block_stop = phys_block;
+ }
+
+ return blkdev_issue_flush(journal->j_dev);
+}
/**
* jbd2_journal_update_sb_errno() - Update error in the journal.
@@ -1892,6 +1977,7 @@ int jbd2_journal_load(journal_t *journal)
{
int err;
journal_superblock_t *sb;
+ struct request_queue *q = bdev_get_queue(journal->j_dev);
err = load_superblock(journal);
if (err)
@@ -1936,6 +2022,12 @@ int jbd2_journal_load(journal_t *journal)
*/
journal->j_flags &= ~JBD2_ABORT;
+ /* if journal device supports discard, discard journal blocks */
+ if (q && blk_queue_discard(q)) {
+ if (__jbd2_journal_issue_discard(journal))
+ printk(KERN_ERR "JBD2: failed to discard journal when loading");
+ }
+
/* OK, we've finished with the dynamic journal bits:
* reinitialise the dynamic contents of the superblock in memory
* and reset them on disk. */
@@ -2244,15 +2336,18 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
EXPORT_SYMBOL(jbd2_journal_clear_features);
/**
- * jbd2_journal_flush() - Flush journal
+ * __jbd2_journal_flush() - Flush journal
* @journal: Journal to act on.
+ * @discard: flag (see below)
*
* Flush all data for a given journal to disk and empty the journal.
* Filesystems can use this when remounting readonly to ensure that
* recovery does not need to happen on remount.
+ *
+ * If 'discard' is false, the journal is simply flushed. If discard is true,
+ * the journal is also discarded.
*/
-
-int jbd2_journal_flush(journal_t *journal)
+static int __jbd2_journal_flush(journal_t *journal, bool discard)
{
int err = 0;
transaction_t *transaction = NULL;
@@ -2306,6 +2401,10 @@ int jbd2_journal_flush(journal_t *journal)
* commits of data to the journal will restore the current
* s_start value. */
jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
+
+ if (discard)
+ err = __jbd2_journal_issue_discard(journal);
+
mutex_unlock(&journal->j_checkpoint_mutex);
write_lock(&journal->j_state_lock);
J_ASSERT(!journal->j_running_transaction);
@@ -2318,6 +2417,17 @@ int jbd2_journal_flush(journal_t *journal)
return err;
}
+int jbd2_journal_flush(journal_t *journal)
+{
+ return __jbd2_journal_flush(journal, false /* don't discard */);
+}
+
+/* flush journal and discard journal log */
+int jbd2_journal_flush_and_discard(journal_t *journal)
+{
+ return __jbd2_journal_flush(journal, true /* also discard */);
+}
+
/**
* jbd2_journal_wipe() - Wipe journal contents
* @journal: Journal to act on.
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 99d3cd051ac3..9bed34e9a273 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1492,6 +1492,7 @@ extern int jbd2_journal_invalidatepage(journal_t *,
extern int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page);
extern int jbd2_journal_stop(handle_t *);
extern int jbd2_journal_flush (journal_t *);
+extern int jbd2_journal_flush_and_discard(journal_t *journal);
extern void jbd2_journal_lock_updates (journal_t *);
extern void jbd2_journal_unlock_updates (journal_t *);
--
2.31.0.291.g576ba9dcdaf-goog
On Thu, Mar 25, 2021 at 06:12:19PM +0000, Leah Rumancik wrote:
> Zero out filename field when file is deleted. Also, add mount option
> nowipe_filename to disable this filename wipeout if desired.
>
> Signed-off-by: Leah Rumancik <[email protected]>
I'm totally cribbing off of the previous decade's dirname wipe patch[1].
[1] https://lore.kernel.org/linux-ext4/[email protected]/T/#ma0442145ca50bb6e62f8e3502d607c758dd24418
> ---
> fs/ext4/ext4.h | 1 +
> fs/ext4/namei.c | 4 ++++
> fs/ext4/super.c | 11 ++++++++++-
> 3 files changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 826a56e3bbd2..8011418176bc 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1247,6 +1247,7 @@ struct ext4_inode_info {
> #define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */
> #define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */
> #define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */
> +#define EXT4_MOUNT2_WIPE_FILENAME 0x00000080 /* Wipe filename on del entry */
>
>
> #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 883e2a7cd4ab..ae6ecabd4d97 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -2492,6 +2492,10 @@ int ext4_generic_delete_entry(struct inode *dir,
> else
> de->inode = 0;
> inode_inc_iversion(dir);
> +
> + if (test_opt2(dir->i_sb, WIPE_FILENAME))
> + memset(de_del->name, 0, de_del->name_len);
You probably ought to erase the file type information too.
> +
> return 0;
> }
> i += ext4_rec_len_from_disk(de->rec_len, blocksize);
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index b9693680463a..5e8737b3f171 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1688,7 +1688,7 @@ enum {
> Opt_dioread_nolock, Opt_dioread_lock,
> Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
> Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
> - Opt_prefetch_block_bitmaps,
> + Opt_prefetch_block_bitmaps, Opt_nowipe_filename,
> #ifdef CONFIG_EXT4_DEBUG
> Opt_fc_debug_max_replay, Opt_fc_debug_force
> #endif
> @@ -1787,6 +1787,7 @@ static const match_table_t tokens = {
> {Opt_test_dummy_encryption, "test_dummy_encryption"},
> {Opt_inlinecrypt, "inlinecrypt"},
> {Opt_nombcache, "nombcache"},
> + {Opt_nowipe_filename, "nowipe_filename"},
> {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
> {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"},
> {Opt_removed, "check=none"}, /* mount option from ext2/3 */
> @@ -2007,6 +2008,8 @@ static const struct mount_opts {
> {Opt_max_dir_size_kb, 0, MOPT_GTE0},
> {Opt_test_dummy_encryption, 0, MOPT_STRING},
> {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
> + {Opt_nowipe_filename, EXT4_MOUNT2_WIPE_FILENAME, MOPT_CLEAR | MOPT_2 |
> + MOPT_EXT4_ONLY},
> {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS,
> MOPT_SET},
> #ifdef CONFIG_EXT4_DEBUG
> @@ -2621,6 +2624,10 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
> } else if (test_opt2(sb, DAX_INODE)) {
> SEQ_OPTS_PUTS("dax=inode");
> }
> +
> + if (!test_opt2(sb, WIPE_FILENAME))
> + SEQ_OPTS_PUTS("nowipe_filename");
Interesting how this time around it's a mount option...
At the risk of dredging up bad old memories, any chance you'd want to
select this if the file being unlinked has EXT4_SECRM_FL set?
Also, uh, I guess this is a change in default behavior? Now you have to
opt-out of wiping filenames? I suppose it's not a big deal
performance-wise since we're logging and writing buffers to disk anyway,
but I bet there's at last a few people who have recovered accidental
deltree invocations this way...
--D
> +
> ext4_show_quota_options(seq, sb);
> return 0;
> }
> @@ -4161,6 +4168,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> if (def_mount_opts & EXT4_DEFM_DISCARD)
> set_opt(sb, DISCARD);
>
> + set_opt2(sb, WIPE_FILENAME);
> +
> sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
> sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
> sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
> --
> 2.31.0.291.g576ba9dcdaf-goog
>
On Thu, Mar 25, 2021 at 06:12:20PM +0000, Leah Rumancik wrote:
> ioctl EXT4_FLUSH_JOURNAL flushes the journal log with the option to
> discard journal log blocks. With the filename wipeout patch, if the discard
> mount option is set, Ext4 guarantees that all data will be discarded on
> deletion. This ioctl allows for periodically discarding journal contents
> too.
Hrm. What is the use case here? I guess the goal is to sanitize the
ondisk log contents (even as wiping deleted filenames becomes default)
every now and then? Why do we want cleaning up the log to be an
explicitly separate step that userspace has to invoke?
(As opposed, say, to discarding the log automatically after every
journal checkpoint if a journal/mount option is set?)
> Also, add journal discard (if discard supported) during journal load
> after recovery. This provides a potential solution to
> https://lore.kernel.org/linux-ext4/[email protected]/ for
> disks that support discard. After a successful journal recovery, e2fsck can
> call this ioctl to discard the journal as well.
>
> Signed-off-by: Leah Rumancik <[email protected]>
> ---
> fs/ext4/ext4.h | 1 +
> fs/ext4/ioctl.c | 28 +++++++++++
> fs/jbd2/journal.c | 116 +++++++++++++++++++++++++++++++++++++++++--
> include/linux/jbd2.h | 1 +
> 4 files changed, 143 insertions(+), 3 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 8011418176bc..92c039ebcba7 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -724,6 +724,7 @@ enum {
> #define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40)
> #define EXT4_IOC_GETSTATE _IOW('f', 41, __u32)
> #define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap)
> +#define EXT4_FLUSH_JOURNAL _IOW('f', 43, int)
>
> #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
>
> diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
> index a2cf35066f46..1d3636c1de3b 100644
> --- a/fs/ext4/ioctl.c
> +++ b/fs/ext4/ioctl.c
> @@ -1318,6 +1318,33 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> return -EOPNOTSUPP;
> return fsverity_ioctl_read_metadata(filp,
> (const void __user *)arg);
> + case EXT4_FLUSH_JOURNAL:
> + {
> + int discard = 0, err = 0;
> +
> + /* file argument is not the mount point */
> + if (file_dentry(filp) != sb->s_root)
> + return -EINVAL;
> +
> + /* filesystem is not backed by block device */
> + if (sb->s_bdev == NULL)
> + return -EINVAL;
> +
> + if (copy_from_user(&discard, (int __user *)arg, sizeof(int)))
> + return -EFAULT;
> +
> + if (EXT4_SB(sb)->s_journal) {
> + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
> +
> + if (discard)
> + err = jbd2_journal_flush_and_discard(EXT4_SB(sb)->s_journal);
> + else
> + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
Why not pass this as a flag?
--D
> +
> + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
> + }
> + return err;
> + }
>
> default:
> return -ENOTTY;
> @@ -1407,6 +1434,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> case EXT4_IOC_GET_ES_CACHE:
> case FS_IOC_FSGETXATTR:
> case FS_IOC_FSSETXATTR:
> + case EXT4_FLUSH_JOURNAL:
> break;
> default:
> return -ENOIOCTLCMD;
> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> index 2dc944442802..9718512e7178 100644
> --- a/fs/jbd2/journal.c
> +++ b/fs/jbd2/journal.c
> @@ -67,6 +67,7 @@ EXPORT_SYMBOL(jbd2_journal_set_triggers);
> EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
> EXPORT_SYMBOL(jbd2_journal_forget);
> EXPORT_SYMBOL(jbd2_journal_flush);
> +EXPORT_SYMBOL(jbd2_journal_flush_and_discard);
> EXPORT_SYMBOL(jbd2_journal_revoke);
>
> EXPORT_SYMBOL(jbd2_journal_init_dev);
> @@ -1686,6 +1687,90 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
> write_unlock(&journal->j_state_lock);
> }
>
> +/* discard journal blocks excluding journal superblock */
> +static int __jbd2_journal_issue_discard(journal_t *journal)
> +{
> + int err = 0;
> + unsigned long block, log_offset;
> + unsigned long long phys_block, block_start, block_stop;
> + loff_t byte_start, byte_stop, byte_count;
> + struct request_queue *q = bdev_get_queue(journal->j_dev);
> +
> + if (!q)
> + return -ENXIO;
> +
> + if (!blk_queue_discard(q))
> + return -EOPNOTSUPP;
> +
> + /* lookup block mapping and issue discard for each contiguous region */
> + log_offset = be32_to_cpu(journal->j_superblock->s_first);
> +
> + err = jbd2_journal_bmap(journal, log_offset, &block_start);
> + if (err) {
> + printk(KERN_ERR "JBD2: bad block at offset %lu", log_offset);
> + return err;
> + }
> +
> + /*
> + * use block_start - 1 to meet check for contiguous with previous region:
> + * phys_block == block_stop + 1
> + */
> + block_stop = block_start - 1;
> +
> + for (block = log_offset; block < journal->j_total_len; block++) {
> + err = jbd2_journal_bmap(journal, block, &phys_block);
> + if (err) {
> + printk(KERN_ERR "JBD2: bad block at offset %lu", block);
> + return err;
> + }
> +
> + /*
> + * if block is last block, update stopping point
> + * if not last block and
> + * block is contiguous with previous block, continue
> + */
> + if (block == journal->j_total_len - 1)
> + block_stop = phys_block;
> + else if (phys_block == block_stop + 1) {
> + block_stop++;
> + continue;
> + }
> +
> + /*
> + * if not contiguous with prior physical block or this is last
> + * block of journal, take care of the region
> + */
> + byte_start = block_start * journal->j_blocksize;
> + byte_stop = block_stop * journal->j_blocksize;
> + byte_count = (block_stop - block_start + 1) *
> + journal->j_blocksize;
> +
> + truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping,
> + byte_start, byte_stop);
> +
> + /*
> + * use blkdev_issue_discard instead of sb_issue_discard
> + * because superblock not yet populated when this is
> + * called during journal_load during mount process
> + */
> + err = blkdev_issue_discard(journal->j_dev,
> + byte_start >> SECTOR_SHIFT,
> + byte_count >> SECTOR_SHIFT,
> + GFP_NOFS, 0);
> +
> + if (unlikely(err != 0)) {
> + printk(KERN_ERR "JBD2: unable to discard "
> + "journal at physical blocks %llu - %llu",
> + block_start, block_stop);
> + return err;
> + }
> +
> + block_start = phys_block;
> + block_stop = phys_block;
> + }
> +
> + return blkdev_issue_flush(journal->j_dev);
> +}
>
> /**
> * jbd2_journal_update_sb_errno() - Update error in the journal.
> @@ -1892,6 +1977,7 @@ int jbd2_journal_load(journal_t *journal)
> {
> int err;
> journal_superblock_t *sb;
> + struct request_queue *q = bdev_get_queue(journal->j_dev);
>
> err = load_superblock(journal);
> if (err)
> @@ -1936,6 +2022,12 @@ int jbd2_journal_load(journal_t *journal)
> */
> journal->j_flags &= ~JBD2_ABORT;
>
> + /* if journal device supports discard, discard journal blocks */
> + if (q && blk_queue_discard(q)) {
> + if (__jbd2_journal_issue_discard(journal))
> + printk(KERN_ERR "JBD2: failed to discard journal when loading");
> + }
> +
> /* OK, we've finished with the dynamic journal bits:
> * reinitialise the dynamic contents of the superblock in memory
> * and reset them on disk. */
> @@ -2244,15 +2336,18 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
> EXPORT_SYMBOL(jbd2_journal_clear_features);
>
> /**
> - * jbd2_journal_flush() - Flush journal
> + * __jbd2_journal_flush() - Flush journal
> * @journal: Journal to act on.
> + * @discard: flag (see below)
> *
> * Flush all data for a given journal to disk and empty the journal.
> * Filesystems can use this when remounting readonly to ensure that
> * recovery does not need to happen on remount.
> + *
> + * If 'discard' is false, the journal is simply flushed. If discard is true,
> + * the journal is also discarded.
> */
> -
> -int jbd2_journal_flush(journal_t *journal)
> +static int __jbd2_journal_flush(journal_t *journal, bool discard)
> {
> int err = 0;
> transaction_t *transaction = NULL;
> @@ -2306,6 +2401,10 @@ int jbd2_journal_flush(journal_t *journal)
> * commits of data to the journal will restore the current
> * s_start value. */
> jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
> +
> + if (discard)
> + err = __jbd2_journal_issue_discard(journal);
> +
> mutex_unlock(&journal->j_checkpoint_mutex);
> write_lock(&journal->j_state_lock);
> J_ASSERT(!journal->j_running_transaction);
> @@ -2318,6 +2417,17 @@ int jbd2_journal_flush(journal_t *journal)
> return err;
> }
>
> +int jbd2_journal_flush(journal_t *journal)
> +{
> + return __jbd2_journal_flush(journal, false /* don't discard */);
> +}
> +
> +/* flush journal and discard journal log */
> +int jbd2_journal_flush_and_discard(journal_t *journal)
> +{
> + return __jbd2_journal_flush(journal, true /* also discard */);
> +}
> +
> /**
> * jbd2_journal_wipe() - Wipe journal contents
> * @journal: Journal to act on.
> diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
> index 99d3cd051ac3..9bed34e9a273 100644
> --- a/include/linux/jbd2.h
> +++ b/include/linux/jbd2.h
> @@ -1492,6 +1492,7 @@ extern int jbd2_journal_invalidatepage(journal_t *,
> extern int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page);
> extern int jbd2_journal_stop(handle_t *);
> extern int jbd2_journal_flush (journal_t *);
> +extern int jbd2_journal_flush_and_discard(journal_t *journal);
> extern void jbd2_journal_lock_updates (journal_t *);
> extern void jbd2_journal_unlock_updates (journal_t *);
>
> --
> 2.31.0.291.g576ba9dcdaf-goog
>
On Mar 25, 2021, at 12:12 PM, Leah Rumancik <[email protected]> wrote:
>
> Zero out filename field when file is deleted. Also, add mount option
> nowipe_filename to disable this filename wipeout if desired.
I would personally be against "wipe out entries on delete" as the default
behavior. I think most users would prefer that their data is maximally
recoverable, rather than the minimal security benefit of erasing deleted
content by default.
I think that Darrick made a good point that using the EXT4_SECRM_FL on
the inode gives users a good option to enable/disable this on a per
file or directory basis.
> Signed-off-by: Leah Rumancik <[email protected]>
> ---
> fs/ext4/ext4.h | 1 +
> fs/ext4/namei.c | 4 ++++
> fs/ext4/super.c | 11 ++++++++++-
> 3 files changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 826a56e3bbd2..8011418176bc 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1247,6 +1247,7 @@ struct ext4_inode_info {
> #define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */
> #define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */
> #define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */
> +#define EXT4_MOUNT2_WIPE_FILENAME 0x00000080 /* Wipe filename on del entry */
>
>
> #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 883e2a7cd4ab..ae6ecabd4d97 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -2492,6 +2492,10 @@ int ext4_generic_delete_entry(struct inode *dir,
> else
> de->inode = 0;
> inode_inc_iversion(dir);
> +
> + if (test_opt2(dir->i_sb, WIPE_FILENAME))
> + memset(de_del->name, 0, de_del->name_len);
> +
> return 0;
> }
> i += ext4_rec_len_from_disk(de->rec_len, blocksize);
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index b9693680463a..5e8737b3f171 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1688,7 +1688,7 @@ enum {
> Opt_dioread_nolock, Opt_dioread_lock,
> Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
> Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
> - Opt_prefetch_block_bitmaps,
> + Opt_prefetch_block_bitmaps, Opt_nowipe_filename,
> #ifdef CONFIG_EXT4_DEBUG
> Opt_fc_debug_max_replay, Opt_fc_debug_force
> #endif
> @@ -1787,6 +1787,7 @@ static const match_table_t tokens = {
> {Opt_test_dummy_encryption, "test_dummy_encryption"},
> {Opt_inlinecrypt, "inlinecrypt"},
> {Opt_nombcache, "nombcache"},
> + {Opt_nowipe_filename, "nowipe_filename"},
> {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
> {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"},
> {Opt_removed, "check=none"}, /* mount option from ext2/3 */
> @@ -2007,6 +2008,8 @@ static const struct mount_opts {
> {Opt_max_dir_size_kb, 0, MOPT_GTE0},
> {Opt_test_dummy_encryption, 0, MOPT_STRING},
> {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
> + {Opt_nowipe_filename, EXT4_MOUNT2_WIPE_FILENAME, MOPT_CLEAR | MOPT_2 |
> + MOPT_EXT4_ONLY},
> {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS,
> MOPT_SET},
> #ifdef CONFIG_EXT4_DEBUG
> @@ -2621,6 +2624,10 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
> } else if (test_opt2(sb, DAX_INODE)) {
> SEQ_OPTS_PUTS("dax=inode");
> }
> +
> + if (!test_opt2(sb, WIPE_FILENAME))
> + SEQ_OPTS_PUTS("nowipe_filename");
> +
> ext4_show_quota_options(seq, sb);
> return 0;
> }
> @@ -4161,6 +4168,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> if (def_mount_opts & EXT4_DEFM_DISCARD)
> set_opt(sb, DISCARD);
>
> + set_opt2(sb, WIPE_FILENAME);
> +
> sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
> sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
> sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
> --
> 2.31.0.291.g576ba9dcdaf-goog
>
Cheers, Andreas
On Fri, Mar 26, 2021 at 4:08 PM Andreas Dilger <[email protected]> wrote:
>
> On Mar 25, 2021, at 12:12 PM, Leah Rumancik <[email protected]> wrote:
> >
> > Zero out filename field when file is deleted. Also, add mount option
> > nowipe_filename to disable this filename wipeout if desired.
>
> I would personally be against "wipe out entries on delete" as the default
> behavior. I think most users would prefer that their data is maximally
> recoverable, rather than the minimal security benefit of erasing deleted
> content by default.
I understand that persistence of filenames provides recoverability
that users might like but I feel like that may break sooner or later.
For example, if we get directory shrinking patches[1] merged in or if
we redesign the directory htree using generic btrees (which will
hopefully support shrinking), this kind of recovery will become very
hard.
Also, I was wondering if persistence of file names was by design? or
it was there due to the way we implemented directories?
[1] https://patchwork.ozlabs.org/project/linux-ext4/list/?series=166560
Thanks,
Harshad
>
> I think that Darrick made a good point that using the EXT4_SECRM_FL on
> the inode gives users a good option to enable/disable this on a per
> file or directory basis.
>
> > Signed-off-by: Leah Rumancik <[email protected]>
> > ---
> > fs/ext4/ext4.h | 1 +
> > fs/ext4/namei.c | 4 ++++
> > fs/ext4/super.c | 11 ++++++++++-
> > 3 files changed, 15 insertions(+), 1 deletion(-)
> >
> > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> > index 826a56e3bbd2..8011418176bc 100644
> > --- a/fs/ext4/ext4.h
> > +++ b/fs/ext4/ext4.h
> > @@ -1247,6 +1247,7 @@ struct ext4_inode_info {
> > #define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */
> > #define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */
> > #define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */
> > +#define EXT4_MOUNT2_WIPE_FILENAME 0x00000080 /* Wipe filename on del entry */
> >
> >
> > #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
> > diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> > index 883e2a7cd4ab..ae6ecabd4d97 100644
> > --- a/fs/ext4/namei.c
> > +++ b/fs/ext4/namei.c
> > @@ -2492,6 +2492,10 @@ int ext4_generic_delete_entry(struct inode *dir,
> > else
> > de->inode = 0;
> > inode_inc_iversion(dir);
> > +
> > + if (test_opt2(dir->i_sb, WIPE_FILENAME))
> > + memset(de_del->name, 0, de_del->name_len);
> > +
> > return 0;
> > }
> > i += ext4_rec_len_from_disk(de->rec_len, blocksize);
> > diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> > index b9693680463a..5e8737b3f171 100644
> > --- a/fs/ext4/super.c
> > +++ b/fs/ext4/super.c
> > @@ -1688,7 +1688,7 @@ enum {
> > Opt_dioread_nolock, Opt_dioread_lock,
> > Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
> > Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
> > - Opt_prefetch_block_bitmaps,
> > + Opt_prefetch_block_bitmaps, Opt_nowipe_filename,
> > #ifdef CONFIG_EXT4_DEBUG
> > Opt_fc_debug_max_replay, Opt_fc_debug_force
> > #endif
> > @@ -1787,6 +1787,7 @@ static const match_table_t tokens = {
> > {Opt_test_dummy_encryption, "test_dummy_encryption"},
> > {Opt_inlinecrypt, "inlinecrypt"},
> > {Opt_nombcache, "nombcache"},
> > + {Opt_nowipe_filename, "nowipe_filename"},
> > {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
> > {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"},
> > {Opt_removed, "check=none"}, /* mount option from ext2/3 */
> > @@ -2007,6 +2008,8 @@ static const struct mount_opts {
> > {Opt_max_dir_size_kb, 0, MOPT_GTE0},
> > {Opt_test_dummy_encryption, 0, MOPT_STRING},
> > {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
> > + {Opt_nowipe_filename, EXT4_MOUNT2_WIPE_FILENAME, MOPT_CLEAR | MOPT_2 |
> > + MOPT_EXT4_ONLY},
> > {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS,
> > MOPT_SET},
> > #ifdef CONFIG_EXT4_DEBUG
> > @@ -2621,6 +2624,10 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
> > } else if (test_opt2(sb, DAX_INODE)) {
> > SEQ_OPTS_PUTS("dax=inode");
> > }
> > +
> > + if (!test_opt2(sb, WIPE_FILENAME))
> > + SEQ_OPTS_PUTS("nowipe_filename");
> > +
> > ext4_show_quota_options(seq, sb);
> > return 0;
> > }
> > @@ -4161,6 +4168,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> > if (def_mount_opts & EXT4_DEFM_DISCARD)
> > set_opt(sb, DISCARD);
> >
> > + set_opt2(sb, WIPE_FILENAME);
> > +
> > sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
> > sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
> > sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
> > --
> > 2.31.0.291.g576ba9dcdaf-goog
> >
>
>
> Cheers, Andreas
>
>
>
>
>
On Fri, Mar 26, 2021 at 06:43:52PM -0700, harshad shirwadkar wrote:
> On Fri, Mar 26, 2021 at 4:08 PM Andreas Dilger <[email protected]> wrote:
> >
> > On Mar 25, 2021, at 12:12 PM, Leah Rumancik <[email protected]> wrote:
> > >
> > > Zero out filename field when file is deleted. Also, add mount option
> > > nowipe_filename to disable this filename wipeout if desired.
> >
> > I would personally be against "wipe out entries on delete" as the default
> > behavior. I think most users would prefer that their data is maximally
> > recoverable, rather than the minimal security benefit of erasing deleted
> > content by default.
> I understand that persistence of filenames provides recoverability
> that users might like but I feel like that may break sooner or later.
> For example, if we get directory shrinking patches[1] merged in or if
> we redesign the directory htree using generic btrees (which will
> hopefully support shrinking), this kind of recovery will become very
> hard.
>
> Also, I was wondering if persistence of file names was by design? or
> it was there due to the way we implemented directories?
I bet it wasn't done by design -- afaict all the recovery tools are
totally opportunistic in that /if/ they find something that looks like a
directory entry, /then/ it will pick that up. The names will eventually
get overwritten, so that's the best they can do.
(I would also wager that people don't like opt-out for new behaviors
unless you're adding it as part of a new feature...)
--D
> [1] https://patchwork.ozlabs.org/project/linux-ext4/list/?series=166560
>
> Thanks,
> Harshad
> >
> > I think that Darrick made a good point that using the EXT4_SECRM_FL on
> > the inode gives users a good option to enable/disable this on a per
> > file or directory basis.
> >
> > > Signed-off-by: Leah Rumancik <[email protected]>
> > > ---
> > > fs/ext4/ext4.h | 1 +
> > > fs/ext4/namei.c | 4 ++++
> > > fs/ext4/super.c | 11 ++++++++++-
> > > 3 files changed, 15 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> > > index 826a56e3bbd2..8011418176bc 100644
> > > --- a/fs/ext4/ext4.h
> > > +++ b/fs/ext4/ext4.h
> > > @@ -1247,6 +1247,7 @@ struct ext4_inode_info {
> > > #define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */
> > > #define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */
> > > #define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */
> > > +#define EXT4_MOUNT2_WIPE_FILENAME 0x00000080 /* Wipe filename on del entry */
> > >
> > >
> > > #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
> > > diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> > > index 883e2a7cd4ab..ae6ecabd4d97 100644
> > > --- a/fs/ext4/namei.c
> > > +++ b/fs/ext4/namei.c
> > > @@ -2492,6 +2492,10 @@ int ext4_generic_delete_entry(struct inode *dir,
> > > else
> > > de->inode = 0;
> > > inode_inc_iversion(dir);
> > > +
> > > + if (test_opt2(dir->i_sb, WIPE_FILENAME))
> > > + memset(de_del->name, 0, de_del->name_len);
> > > +
> > > return 0;
> > > }
> > > i += ext4_rec_len_from_disk(de->rec_len, blocksize);
> > > diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> > > index b9693680463a..5e8737b3f171 100644
> > > --- a/fs/ext4/super.c
> > > +++ b/fs/ext4/super.c
> > > @@ -1688,7 +1688,7 @@ enum {
> > > Opt_dioread_nolock, Opt_dioread_lock,
> > > Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
> > > Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
> > > - Opt_prefetch_block_bitmaps,
> > > + Opt_prefetch_block_bitmaps, Opt_nowipe_filename,
> > > #ifdef CONFIG_EXT4_DEBUG
> > > Opt_fc_debug_max_replay, Opt_fc_debug_force
> > > #endif
> > > @@ -1787,6 +1787,7 @@ static const match_table_t tokens = {
> > > {Opt_test_dummy_encryption, "test_dummy_encryption"},
> > > {Opt_inlinecrypt, "inlinecrypt"},
> > > {Opt_nombcache, "nombcache"},
> > > + {Opt_nowipe_filename, "nowipe_filename"},
> > > {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
> > > {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"},
> > > {Opt_removed, "check=none"}, /* mount option from ext2/3 */
> > > @@ -2007,6 +2008,8 @@ static const struct mount_opts {
> > > {Opt_max_dir_size_kb, 0, MOPT_GTE0},
> > > {Opt_test_dummy_encryption, 0, MOPT_STRING},
> > > {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
> > > + {Opt_nowipe_filename, EXT4_MOUNT2_WIPE_FILENAME, MOPT_CLEAR | MOPT_2 |
> > > + MOPT_EXT4_ONLY},
> > > {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS,
> > > MOPT_SET},
> > > #ifdef CONFIG_EXT4_DEBUG
> > > @@ -2621,6 +2624,10 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
> > > } else if (test_opt2(sb, DAX_INODE)) {
> > > SEQ_OPTS_PUTS("dax=inode");
> > > }
> > > +
> > > + if (!test_opt2(sb, WIPE_FILENAME))
> > > + SEQ_OPTS_PUTS("nowipe_filename");
> > > +
> > > ext4_show_quota_options(seq, sb);
> > > return 0;
> > > }
> > > @@ -4161,6 +4168,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> > > if (def_mount_opts & EXT4_DEFM_DISCARD)
> > > set_opt(sb, DISCARD);
> > >
> > > + set_opt2(sb, WIPE_FILENAME);
> > > +
> > > sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
> > > sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
> > > sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
> > > --
> > > 2.31.0.291.g576ba9dcdaf-goog
> > >
> >
> >
> > Cheers, Andreas
> >
> >
> >
> >
> >
On Thu, Mar 25, 2021 at 06:21:46PM -0700, Darrick J. Wong wrote:
> On Thu, Mar 25, 2021 at 06:12:20PM +0000, Leah Rumancik wrote:
> > ioctl EXT4_FLUSH_JOURNAL flushes the journal log with the option to
> > discard journal log blocks. With the filename wipeout patch, if the discard
> > mount option is set, Ext4 guarantees that all data will be discarded on
> > deletion. This ioctl allows for periodically discarding journal contents
> > too.
>
> Hrm. What is the use case here? I guess the goal is to sanitize the
> ondisk log contents (even as wiping deleted filenames becomes default)
> every now and then? Why do we want cleaning up the log to be an
> explicitly separate step that userspace has to invoke?
>
> (As opposed, say, to discarding the log automatically after every
> journal checkpoint if a journal/mount option is set?)
The goal here is to be able to ensure everything is sanitized at a
particular point in time. If done automatically through the checkpoint,
there is no guarantee as to how often / when the sanitizing is
performed.
>
> > Also, add journal discard (if discard supported) during journal load
> > after recovery. This provides a potential solution to
> > https://lore.kernel.org/linux-ext4/[email protected]/ for
> > disks that support discard. After a successful journal recovery, e2fsck can
> > call this ioctl to discard the journal as well.
> >
> > Signed-off-by: Leah Rumancik <[email protected]>
> > ---
> > fs/ext4/ext4.h | 1 +
> > fs/ext4/ioctl.c | 28 +++++++++++
> > fs/jbd2/journal.c | 116 +++++++++++++++++++++++++++++++++++++++++--
> > include/linux/jbd2.h | 1 +
> > 4 files changed, 143 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> > index 8011418176bc..92c039ebcba7 100644
> > --- a/fs/ext4/ext4.h
> > +++ b/fs/ext4/ext4.h
> > @@ -724,6 +724,7 @@ enum {
> > #define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40)
> > #define EXT4_IOC_GETSTATE _IOW('f', 41, __u32)
> > #define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap)
> > +#define EXT4_FLUSH_JOURNAL _IOW('f', 43, int)
> >
> > #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
> >
> > diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
> > index a2cf35066f46..1d3636c1de3b 100644
> > --- a/fs/ext4/ioctl.c
> > +++ b/fs/ext4/ioctl.c
> > @@ -1318,6 +1318,33 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> > return -EOPNOTSUPP;
> > return fsverity_ioctl_read_metadata(filp,
> > (const void __user *)arg);
> > + case EXT4_FLUSH_JOURNAL:
> > + {
> > + int discard = 0, err = 0;
> > +
> > + /* file argument is not the mount point */
> > + if (file_dentry(filp) != sb->s_root)
> > + return -EINVAL;
> > +
> > + /* filesystem is not backed by block device */
> > + if (sb->s_bdev == NULL)
> > + return -EINVAL;
> > +
> > + if (copy_from_user(&discard, (int __user *)arg, sizeof(int)))
> > + return -EFAULT;
> > +
> > + if (EXT4_SB(sb)->s_journal) {
> > + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
> > +
> > + if (discard)
> > + err = jbd2_journal_flush_and_discard(EXT4_SB(sb)->s_journal);
> > + else
> > + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
>
> Why not pass this as a flag?
>
> --D
>
Yes sure, that would make it simpler. I will update it.
> > +
> > + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
> > + }
> > + return err;
> > + }
> >
> > default:
> > return -ENOTTY;
> > @@ -1407,6 +1434,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> > case EXT4_IOC_GET_ES_CACHE:
> > case FS_IOC_FSGETXATTR:
> > case FS_IOC_FSSETXATTR:
> > + case EXT4_FLUSH_JOURNAL:
> > break;
> > default:
> > return -ENOIOCTLCMD;
> > diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> > index 2dc944442802..9718512e7178 100644
> > --- a/fs/jbd2/journal.c
> > +++ b/fs/jbd2/journal.c
> > @@ -67,6 +67,7 @@ EXPORT_SYMBOL(jbd2_journal_set_triggers);
> > EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
> > EXPORT_SYMBOL(jbd2_journal_forget);
> > EXPORT_SYMBOL(jbd2_journal_flush);
> > +EXPORT_SYMBOL(jbd2_journal_flush_and_discard);
> > EXPORT_SYMBOL(jbd2_journal_revoke);
> >
> > EXPORT_SYMBOL(jbd2_journal_init_dev);
> > @@ -1686,6 +1687,90 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
> > write_unlock(&journal->j_state_lock);
> > }
> >
> > +/* discard journal blocks excluding journal superblock */
> > +static int __jbd2_journal_issue_discard(journal_t *journal)
> > +{
> > + int err = 0;
> > + unsigned long block, log_offset;
> > + unsigned long long phys_block, block_start, block_stop;
> > + loff_t byte_start, byte_stop, byte_count;
> > + struct request_queue *q = bdev_get_queue(journal->j_dev);
> > +
> > + if (!q)
> > + return -ENXIO;
> > +
> > + if (!blk_queue_discard(q))
> > + return -EOPNOTSUPP;
> > +
> > + /* lookup block mapping and issue discard for each contiguous region */
> > + log_offset = be32_to_cpu(journal->j_superblock->s_first);
> > +
> > + err = jbd2_journal_bmap(journal, log_offset, &block_start);
> > + if (err) {
> > + printk(KERN_ERR "JBD2: bad block at offset %lu", log_offset);
> > + return err;
> > + }
> > +
> > + /*
> > + * use block_start - 1 to meet check for contiguous with previous region:
> > + * phys_block == block_stop + 1
> > + */
> > + block_stop = block_start - 1;
> > +
> > + for (block = log_offset; block < journal->j_total_len; block++) {
> > + err = jbd2_journal_bmap(journal, block, &phys_block);
> > + if (err) {
> > + printk(KERN_ERR "JBD2: bad block at offset %lu", block);
> > + return err;
> > + }
> > +
> > + /*
> > + * if block is last block, update stopping point
> > + * if not last block and
> > + * block is contiguous with previous block, continue
> > + */
> > + if (block == journal->j_total_len - 1)
> > + block_stop = phys_block;
> > + else if (phys_block == block_stop + 1) {
> > + block_stop++;
> > + continue;
> > + }
> > +
> > + /*
> > + * if not contiguous with prior physical block or this is last
> > + * block of journal, take care of the region
> > + */
> > + byte_start = block_start * journal->j_blocksize;
> > + byte_stop = block_stop * journal->j_blocksize;
> > + byte_count = (block_stop - block_start + 1) *
> > + journal->j_blocksize;
> > +
> > + truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping,
> > + byte_start, byte_stop);
> > +
> > + /*
> > + * use blkdev_issue_discard instead of sb_issue_discard
> > + * because superblock not yet populated when this is
> > + * called during journal_load during mount process
> > + */
> > + err = blkdev_issue_discard(journal->j_dev,
> > + byte_start >> SECTOR_SHIFT,
> > + byte_count >> SECTOR_SHIFT,
> > + GFP_NOFS, 0);
> > +
> > + if (unlikely(err != 0)) {
> > + printk(KERN_ERR "JBD2: unable to discard "
> > + "journal at physical blocks %llu - %llu",
> > + block_start, block_stop);
> > + return err;
> > + }
> > +
> > + block_start = phys_block;
> > + block_stop = phys_block;
> > + }
> > +
> > + return blkdev_issue_flush(journal->j_dev);
> > +}
> >
> > /**
> > * jbd2_journal_update_sb_errno() - Update error in the journal.
> > @@ -1892,6 +1977,7 @@ int jbd2_journal_load(journal_t *journal)
> > {
> > int err;
> > journal_superblock_t *sb;
> > + struct request_queue *q = bdev_get_queue(journal->j_dev);
> >
> > err = load_superblock(journal);
> > if (err)
> > @@ -1936,6 +2022,12 @@ int jbd2_journal_load(journal_t *journal)
> > */
> > journal->j_flags &= ~JBD2_ABORT;
> >
> > + /* if journal device supports discard, discard journal blocks */
> > + if (q && blk_queue_discard(q)) {
> > + if (__jbd2_journal_issue_discard(journal))
> > + printk(KERN_ERR "JBD2: failed to discard journal when loading");
> > + }
> > +
> > /* OK, we've finished with the dynamic journal bits:
> > * reinitialise the dynamic contents of the superblock in memory
> > * and reset them on disk. */
> > @@ -2244,15 +2336,18 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
> > EXPORT_SYMBOL(jbd2_journal_clear_features);
> >
> > /**
> > - * jbd2_journal_flush() - Flush journal
> > + * __jbd2_journal_flush() - Flush journal
> > * @journal: Journal to act on.
> > + * @discard: flag (see below)
> > *
> > * Flush all data for a given journal to disk and empty the journal.
> > * Filesystems can use this when remounting readonly to ensure that
> > * recovery does not need to happen on remount.
> > + *
> > + * If 'discard' is false, the journal is simply flushed. If discard is true,
> > + * the journal is also discarded.
> > */
> > -
> > -int jbd2_journal_flush(journal_t *journal)
> > +static int __jbd2_journal_flush(journal_t *journal, bool discard)
> > {
> > int err = 0;
> > transaction_t *transaction = NULL;
> > @@ -2306,6 +2401,10 @@ int jbd2_journal_flush(journal_t *journal)
> > * commits of data to the journal will restore the current
> > * s_start value. */
> > jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
> > +
> > + if (discard)
> > + err = __jbd2_journal_issue_discard(journal);
> > +
> > mutex_unlock(&journal->j_checkpoint_mutex);
> > write_lock(&journal->j_state_lock);
> > J_ASSERT(!journal->j_running_transaction);
> > @@ -2318,6 +2417,17 @@ int jbd2_journal_flush(journal_t *journal)
> > return err;
> > }
> >
> > +int jbd2_journal_flush(journal_t *journal)
> > +{
> > + return __jbd2_journal_flush(journal, false /* don't discard */);
> > +}
> > +
> > +/* flush journal and discard journal log */
> > +int jbd2_journal_flush_and_discard(journal_t *journal)
> > +{
> > + return __jbd2_journal_flush(journal, true /* also discard */);
> > +}
> > +
> > /**
> > * jbd2_journal_wipe() - Wipe journal contents
> > * @journal: Journal to act on.
> > diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
> > index 99d3cd051ac3..9bed34e9a273 100644
> > --- a/include/linux/jbd2.h
> > +++ b/include/linux/jbd2.h
> > @@ -1492,6 +1492,7 @@ extern int jbd2_journal_invalidatepage(journal_t *,
> > extern int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page);
> > extern int jbd2_journal_stop(handle_t *);
> > extern int jbd2_journal_flush (journal_t *);
> > +extern int jbd2_journal_flush_and_discard(journal_t *journal);
> > extern void jbd2_journal_lock_updates (journal_t *);
> > extern void jbd2_journal_unlock_updates (journal_t *);
> >
> > --
> > 2.31.0.291.g576ba9dcdaf-goog
> >
Thanks for the comments :)
On Fri, Mar 26, 2021 at 07:08:23PM -0700, Darrick J. Wong wrote:
> On Fri, Mar 26, 2021 at 06:43:52PM -0700, harshad shirwadkar wrote:
> > On Fri, Mar 26, 2021 at 4:08 PM Andreas Dilger <[email protected]> wrote:
> > >
> > > On Mar 25, 2021, at 12:12 PM, Leah Rumancik <[email protected]> wrote:
> > > >
> > > > Zero out filename field when file is deleted. Also, add mount option
> > > > nowipe_filename to disable this filename wipeout if desired.
> > >
> > > I would personally be against "wipe out entries on delete" as the default
> > > behavior. I think most users would prefer that their data is maximally
> > > recoverable, rather than the minimal security benefit of erasing deleted
> > > content by default.
> > I understand that persistence of filenames provides recoverability
> > that users might like but I feel like that may break sooner or later.
> > For example, if we get directory shrinking patches[1] merged in or if
> > we redesign the directory htree using generic btrees (which will
> > hopefully support shrinking), this kind of recovery will become very
> > hard.
> >
> > Also, I was wondering if persistence of file names was by design? or
> > it was there due to the way we implemented directories?
>
> I bet it wasn't done by design -- afaict all the recovery tools are
> totally opportunistic in that /if/ they find something that looks like a
> directory entry, /then/ it will pick that up. The names will eventually
> get overwritten, so that's the best they can do.
>
> (I would also wager that people don't like opt-out for new behaviors
> unless you're adding it as part of a new feature...)
>
> --D
What about a mount option to enable the filename wipe (with the wiping
disabled by default)?
>
> > [1] https://patchwork.ozlabs.org/project/linux-ext4/list/?series=166560
> >
> > Thanks,
> > Harshad
> > >
> > > I think that Darrick made a good point that using the EXT4_SECRM_FL on
> > > the inode gives users a good option to enable/disable this on a per
> > > file or directory basis.
> > >
> > > > Signed-off-by: Leah Rumancik <[email protected]>
> > > > ---
> > > > fs/ext4/ext4.h | 1 +
> > > > fs/ext4/namei.c | 4 ++++
> > > > fs/ext4/super.c | 11 ++++++++++-
> > > > 3 files changed, 15 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> > > > index 826a56e3bbd2..8011418176bc 100644
> > > > --- a/fs/ext4/ext4.h
> > > > +++ b/fs/ext4/ext4.h
> > > > @@ -1247,6 +1247,7 @@ struct ext4_inode_info {
> > > > #define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */
> > > > #define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */
> > > > #define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */
> > > > +#define EXT4_MOUNT2_WIPE_FILENAME 0x00000080 /* Wipe filename on del entry */
> > > >
> > > >
> > > > #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
> > > > diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> > > > index 883e2a7cd4ab..ae6ecabd4d97 100644
> > > > --- a/fs/ext4/namei.c
> > > > +++ b/fs/ext4/namei.c
> > > > @@ -2492,6 +2492,10 @@ int ext4_generic_delete_entry(struct inode *dir,
> > > > else
> > > > de->inode = 0;
> > > > inode_inc_iversion(dir);
> > > > +
> > > > + if (test_opt2(dir->i_sb, WIPE_FILENAME))
> > > > + memset(de_del->name, 0, de_del->name_len);
> > > > +
> > > > return 0;
> > > > }
> > > > i += ext4_rec_len_from_disk(de->rec_len, blocksize);
> > > > diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> > > > index b9693680463a..5e8737b3f171 100644
> > > > --- a/fs/ext4/super.c
> > > > +++ b/fs/ext4/super.c
> > > > @@ -1688,7 +1688,7 @@ enum {
> > > > Opt_dioread_nolock, Opt_dioread_lock,
> > > > Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
> > > > Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
> > > > - Opt_prefetch_block_bitmaps,
> > > > + Opt_prefetch_block_bitmaps, Opt_nowipe_filename,
> > > > #ifdef CONFIG_EXT4_DEBUG
> > > > Opt_fc_debug_max_replay, Opt_fc_debug_force
> > > > #endif
> > > > @@ -1787,6 +1787,7 @@ static const match_table_t tokens = {
> > > > {Opt_test_dummy_encryption, "test_dummy_encryption"},
> > > > {Opt_inlinecrypt, "inlinecrypt"},
> > > > {Opt_nombcache, "nombcache"},
> > > > + {Opt_nowipe_filename, "nowipe_filename"},
> > > > {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
> > > > {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"},
> > > > {Opt_removed, "check=none"}, /* mount option from ext2/3 */
> > > > @@ -2007,6 +2008,8 @@ static const struct mount_opts {
> > > > {Opt_max_dir_size_kb, 0, MOPT_GTE0},
> > > > {Opt_test_dummy_encryption, 0, MOPT_STRING},
> > > > {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
> > > > + {Opt_nowipe_filename, EXT4_MOUNT2_WIPE_FILENAME, MOPT_CLEAR | MOPT_2 |
> > > > + MOPT_EXT4_ONLY},
> > > > {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS,
> > > > MOPT_SET},
> > > > #ifdef CONFIG_EXT4_DEBUG
> > > > @@ -2621,6 +2624,10 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
> > > > } else if (test_opt2(sb, DAX_INODE)) {
> > > > SEQ_OPTS_PUTS("dax=inode");
> > > > }
> > > > +
> > > > + if (!test_opt2(sb, WIPE_FILENAME))
> > > > + SEQ_OPTS_PUTS("nowipe_filename");
> > > > +
> > > > ext4_show_quota_options(seq, sb);
> > > > return 0;
> > > > }
> > > > @@ -4161,6 +4168,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> > > > if (def_mount_opts & EXT4_DEFM_DISCARD)
> > > > set_opt(sb, DISCARD);
> > > >
> > > > + set_opt2(sb, WIPE_FILENAME);
> > > > +
> > > > sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
> > > > sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
> > > > sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
> > > > --
> > > > 2.31.0.291.g576ba9dcdaf-goog
> > > >
> > >
> > >
> > > Cheers, Andreas
> > >
> > >
> > >
> > >
> > >
> On Mar 29, 2021, at 10:06 AM, Leah Rumancik <[email protected]> wrote:
>
> On Fri, Mar 26, 2021 at 07:08:23PM -0700, Darrick J. Wong wrote:
>> On Fri, Mar 26, 2021 at 06:43:52PM -0700, harshad shirwadkar wrote:
>>> On Fri, Mar 26, 2021 at 4:08 PM Andreas Dilger <[email protected]> wrote:
>>>>
>>>> On Mar 25, 2021, at 12:12 PM, Leah Rumancik <[email protected]> wrote:
>>>>>
>>>>> Zero out filename field when file is deleted. Also, add mount option
>>>>> nowipe_filename to disable this filename wipeout if desired.
>>>>
>>>> I would personally be against "wipe out entries on delete" as the default
>>>> behavior. I think most users would prefer that their data is maximally
>>>> recoverable, rather than the minimal security benefit of erasing deleted
>>>> content by default.
>>> I understand that persistence of filenames provides recoverability
>>> that users might like but I feel like that may break sooner or later.
>>> For example, if we get directory shrinking patches[1] merged in or if
>>> we redesign the directory htree using generic btrees (which will
>>> hopefully support shrinking), this kind of recovery will become very
>>> hard.
>>>
>>> Also, I was wondering if persistence of file names was by design? or
>>> it was there due to the way we implemented directories?
>>
>> I bet it wasn't done by design -- afaict all the recovery tools are
>> totally opportunistic in that /if/ they find something that looks like a
>> directory entry, /then/ it will pick that up. The names will eventually
>> get overwritten, so that's the best they can do.
>>
>> (I would also wager that people don't like opt-out for new behaviors
>> unless you're adding it as part of a new feature...)
>>
>> --D
>
> What about a mount option to enable the filename wipe (with the wiping
> disabled by default)?
I would be OK with a mount option to enable it globally, or EXT2_SECRM_FL
to trigger this on a per-file or per-directory basis (entry is zeroed
if this is set on either the file or the directory it is located in, and
possibly the file's data blocks are trimmed from flash).
Cheers, Andreas
>>> [1] https://patchwork.ozlabs.org/project/linux-ext4/list/?series=166560
>>>
>>> Thanks,
>>> Harshad
>>>>
>>>> I think that Darrick made a good point that using the EXT4_SECRM_FL on
>>>> the inode gives users a good option to enable/disable this on a per
>>>> file or directory basis.
>>>>
>>>>> Signed-off-by: Leah Rumancik <[email protected]>
>>>>> ---
>>>>> fs/ext4/ext4.h | 1 +
>>>>> fs/ext4/namei.c | 4 ++++
>>>>> fs/ext4/super.c | 11 ++++++++++-
>>>>> 3 files changed, 15 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>>>>> index 826a56e3bbd2..8011418176bc 100644
>>>>> --- a/fs/ext4/ext4.h
>>>>> +++ b/fs/ext4/ext4.h
>>>>> @@ -1247,6 +1247,7 @@ struct ext4_inode_info {
>>>>> #define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */
>>>>> #define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */
>>>>> #define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */
>>>>> +#define EXT4_MOUNT2_WIPE_FILENAME 0x00000080 /* Wipe filename on del entry */
>>>>>
>>>>>
>>>>> #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
>>>>> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
>>>>> index 883e2a7cd4ab..ae6ecabd4d97 100644
>>>>> --- a/fs/ext4/namei.c
>>>>> +++ b/fs/ext4/namei.c
>>>>> @@ -2492,6 +2492,10 @@ int ext4_generic_delete_entry(struct inode *dir,
>>>>> else
>>>>> de->inode = 0;
>>>>> inode_inc_iversion(dir);
>>>>> +
>>>>> + if (test_opt2(dir->i_sb, WIPE_FILENAME))
>>>>> + memset(de_del->name, 0, de_del->name_len);
>>>>> +
>>>>> return 0;
>>>>> }
>>>>> i += ext4_rec_len_from_disk(de->rec_len, blocksize);
>>>>> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
>>>>> index b9693680463a..5e8737b3f171 100644
>>>>> --- a/fs/ext4/super.c
>>>>> +++ b/fs/ext4/super.c
>>>>> @@ -1688,7 +1688,7 @@ enum {
>>>>> Opt_dioread_nolock, Opt_dioread_lock,
>>>>> Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
>>>>> Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
>>>>> - Opt_prefetch_block_bitmaps,
>>>>> + Opt_prefetch_block_bitmaps, Opt_nowipe_filename,
>>>>> #ifdef CONFIG_EXT4_DEBUG
>>>>> Opt_fc_debug_max_replay, Opt_fc_debug_force
>>>>> #endif
>>>>> @@ -1787,6 +1787,7 @@ static const match_table_t tokens = {
>>>>> {Opt_test_dummy_encryption, "test_dummy_encryption"},
>>>>> {Opt_inlinecrypt, "inlinecrypt"},
>>>>> {Opt_nombcache, "nombcache"},
>>>>> + {Opt_nowipe_filename, "nowipe_filename"},
>>>>> {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
>>>>> {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"},
>>>>> {Opt_removed, "check=none"}, /* mount option from ext2/3 */
>>>>> @@ -2007,6 +2008,8 @@ static const struct mount_opts {
>>>>> {Opt_max_dir_size_kb, 0, MOPT_GTE0},
>>>>> {Opt_test_dummy_encryption, 0, MOPT_STRING},
>>>>> {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
>>>>> + {Opt_nowipe_filename, EXT4_MOUNT2_WIPE_FILENAME, MOPT_CLEAR | MOPT_2 |
>>>>> + MOPT_EXT4_ONLY},
>>>>> {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS,
>>>>> MOPT_SET},
>>>>> #ifdef CONFIG_EXT4_DEBUG
>>>>> @@ -2621,6 +2624,10 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
>>>>> } else if (test_opt2(sb, DAX_INODE)) {
>>>>> SEQ_OPTS_PUTS("dax=inode");
>>>>> }
>>>>> +
>>>>> + if (!test_opt2(sb, WIPE_FILENAME))
>>>>> + SEQ_OPTS_PUTS("nowipe_filename");
>>>>> +
>>>>> ext4_show_quota_options(seq, sb);
>>>>> return 0;
>>>>> }
>>>>> @@ -4161,6 +4168,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>>>>> if (def_mount_opts & EXT4_DEFM_DISCARD)
>>>>> set_opt(sb, DISCARD);
>>>>>
>>>>> + set_opt2(sb, WIPE_FILENAME);
>>>>> +
>>>>> sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
>>>>> sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
>>>>> sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
>>>>> --
>>>>> 2.31.0.291.g576ba9dcdaf-goog
>>>>>
>>>>
>>>>
>>>> Cheers, Andreas
Cheers, Andreas
On Mon, Mar 29, 2021 at 8:09 AM Leah Rumancik <[email protected]> wrote:
>
> On Thu, Mar 25, 2021 at 06:21:46PM -0700, Darrick J. Wong wrote:
> > On Thu, Mar 25, 2021 at 06:12:20PM +0000, Leah Rumancik wrote:
> > > ioctl EXT4_FLUSH_JOURNAL flushes the journal log with the option to
> > > discard journal log blocks. With the filename wipeout patch, if the discard
> > > mount option is set, Ext4 guarantees that all data will be discarded on
> > > deletion. This ioctl allows for periodically discarding journal contents
> > > too.
> >
> > Hrm. What is the use case here? I guess the goal is to sanitize the
> > ondisk log contents (even as wiping deleted filenames becomes default)
> > every now and then? Why do we want cleaning up the log to be an
> > explicitly separate step that userspace has to invoke?
> >
> > (As opposed, say, to discarding the log automatically after every
> > journal checkpoint if a journal/mount option is set?)
>
> The goal here is to be able to ensure everything is sanitized at a
> particular point in time. If done automatically through the checkpoint,
> there is no guarantee as to how often / when the sanitizing is
> performed.
To elaborate more on that, this allows userspace to control the
frequency of journal discards and this is useful if the userspace
application is trying to put a bound on content erasing time. With
Leah's earlier patch in the series, Ext4 when mounted with the
"discard" mount option would ensure that whenever a file is deleted,
it disappears completely from the fliesystem controlled LBAs of the
underlying block device, but no such guarantee is provided for the
potential remains of the file in the journal. With this
user-controlled discard frequency of the journal, userspace can ensure
that the contents of a deleted file disappear entirely within X hours
by calling this ioctl once every X hours. This allows us to greatly
simplify journal discards - that's because now we don't have to track
the journal blocks belonging to a particular file.
Also, sending discards to the entire journal with every commit /
checkpoint may have some performance issues. So, if we want this to be
automated, one option would be to invoke this after every N commits
performed by the journal thread and make N configurable as a mount
time parameter. But, it's just simpler to call this ioctl from the
application and let it worry about the frequency, and perhaps these
applications would anyway be calling fstrim periodically to ensure
such guarantees. If they are doing that, this would just add another
step in their periodic discard routine.
- Harshad
- Harshad
>
> >
> > > Also, add journal discard (if discard supported) during journal load
> > > after recovery. This provides a potential solution to
> > > https://lore.kernel.org/linux-ext4/[email protected]/ for
> > > disks that support discard. After a successful journal recovery, e2fsck can
> > > call this ioctl to discard the journal as well.
> > >
> > > Signed-off-by: Leah Rumancik <[email protected]>
> > > ---
> > > fs/ext4/ext4.h | 1 +
> > > fs/ext4/ioctl.c | 28 +++++++++++
> > > fs/jbd2/journal.c | 116 +++++++++++++++++++++++++++++++++++++++++--
> > > include/linux/jbd2.h | 1 +
> > > 4 files changed, 143 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> > > index 8011418176bc..92c039ebcba7 100644
> > > --- a/fs/ext4/ext4.h
> > > +++ b/fs/ext4/ext4.h
> > > @@ -724,6 +724,7 @@ enum {
> > > #define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40)
> > > #define EXT4_IOC_GETSTATE _IOW('f', 41, __u32)
> > > #define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap)
> > > +#define EXT4_FLUSH_JOURNAL _IOW('f', 43, int)
> > >
> > > #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
> > >
> > > diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
> > > index a2cf35066f46..1d3636c1de3b 100644
> > > --- a/fs/ext4/ioctl.c
> > > +++ b/fs/ext4/ioctl.c
> > > @@ -1318,6 +1318,33 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> > > return -EOPNOTSUPP;
> > > return fsverity_ioctl_read_metadata(filp,
> > > (const void __user *)arg);
> > > + case EXT4_FLUSH_JOURNAL:
> > > + {
> > > + int discard = 0, err = 0;
> > > +
> > > + /* file argument is not the mount point */
> > > + if (file_dentry(filp) != sb->s_root)
> > > + return -EINVAL;
> > > +
> > > + /* filesystem is not backed by block device */
> > > + if (sb->s_bdev == NULL)
> > > + return -EINVAL;
> > > +
> > > + if (copy_from_user(&discard, (int __user *)arg, sizeof(int)))
> > > + return -EFAULT;
> > > +
> > > + if (EXT4_SB(sb)->s_journal) {
> > > + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
> > > +
> > > + if (discard)
> > > + err = jbd2_journal_flush_and_discard(EXT4_SB(sb)->s_journal);
> > > + else
> > > + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
> >
> > Why not pass this as a flag?
> >
> > --D
> >
>
> Yes sure, that would make it simpler. I will update it.
>
> > > +
> > > + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
> > > + }
> > > + return err;
> > > + }
> > >
> > > default:
> > > return -ENOTTY;
> > > @@ -1407,6 +1434,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> > > case EXT4_IOC_GET_ES_CACHE:
> > > case FS_IOC_FSGETXATTR:
> > > case FS_IOC_FSSETXATTR:
> > > + case EXT4_FLUSH_JOURNAL:
> > > break;
> > > default:
> > > return -ENOIOCTLCMD;
> > > diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> > > index 2dc944442802..9718512e7178 100644
> > > --- a/fs/jbd2/journal.c
> > > +++ b/fs/jbd2/journal.c
> > > @@ -67,6 +67,7 @@ EXPORT_SYMBOL(jbd2_journal_set_triggers);
> > > EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
> > > EXPORT_SYMBOL(jbd2_journal_forget);
> > > EXPORT_SYMBOL(jbd2_journal_flush);
> > > +EXPORT_SYMBOL(jbd2_journal_flush_and_discard);
> > > EXPORT_SYMBOL(jbd2_journal_revoke);
> > >
> > > EXPORT_SYMBOL(jbd2_journal_init_dev);
> > > @@ -1686,6 +1687,90 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
> > > write_unlock(&journal->j_state_lock);
> > > }
> > >
> > > +/* discard journal blocks excluding journal superblock */
> > > +static int __jbd2_journal_issue_discard(journal_t *journal)
> > > +{
> > > + int err = 0;
> > > + unsigned long block, log_offset;
> > > + unsigned long long phys_block, block_start, block_stop;
> > > + loff_t byte_start, byte_stop, byte_count;
> > > + struct request_queue *q = bdev_get_queue(journal->j_dev);
> > > +
> > > + if (!q)
> > > + return -ENXIO;
> > > +
> > > + if (!blk_queue_discard(q))
> > > + return -EOPNOTSUPP;
> > > +
> > > + /* lookup block mapping and issue discard for each contiguous region */
> > > + log_offset = be32_to_cpu(journal->j_superblock->s_first);
> > > +
> > > + err = jbd2_journal_bmap(journal, log_offset, &block_start);
> > > + if (err) {
> > > + printk(KERN_ERR "JBD2: bad block at offset %lu", log_offset);
> > > + return err;
> > > + }
> > > +
> > > + /*
> > > + * use block_start - 1 to meet check for contiguous with previous region:
> > > + * phys_block == block_stop + 1
> > > + */
> > > + block_stop = block_start - 1;
> > > +
> > > + for (block = log_offset; block < journal->j_total_len; block++) {
> > > + err = jbd2_journal_bmap(journal, block, &phys_block);
> > > + if (err) {
> > > + printk(KERN_ERR "JBD2: bad block at offset %lu", block);
> > > + return err;
> > > + }
> > > +
> > > + /*
> > > + * if block is last block, update stopping point
> > > + * if not last block and
> > > + * block is contiguous with previous block, continue
> > > + */
> > > + if (block == journal->j_total_len - 1)
> > > + block_stop = phys_block;
> > > + else if (phys_block == block_stop + 1) {
> > > + block_stop++;
> > > + continue;
> > > + }
> > > +
> > > + /*
> > > + * if not contiguous with prior physical block or this is last
> > > + * block of journal, take care of the region
> > > + */
> > > + byte_start = block_start * journal->j_blocksize;
> > > + byte_stop = block_stop * journal->j_blocksize;
> > > + byte_count = (block_stop - block_start + 1) *
> > > + journal->j_blocksize;
> > > +
> > > + truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping,
> > > + byte_start, byte_stop);
> > > +
> > > + /*
> > > + * use blkdev_issue_discard instead of sb_issue_discard
> > > + * because superblock not yet populated when this is
> > > + * called during journal_load during mount process
> > > + */
> > > + err = blkdev_issue_discard(journal->j_dev,
> > > + byte_start >> SECTOR_SHIFT,
> > > + byte_count >> SECTOR_SHIFT,
> > > + GFP_NOFS, 0);
> > > +
> > > + if (unlikely(err != 0)) {
> > > + printk(KERN_ERR "JBD2: unable to discard "
> > > + "journal at physical blocks %llu - %llu",
> > > + block_start, block_stop);
> > > + return err;
> > > + }
> > > +
> > > + block_start = phys_block;
> > > + block_stop = phys_block;
> > > + }
> > > +
> > > + return blkdev_issue_flush(journal->j_dev);
> > > +}
> > >
> > > /**
> > > * jbd2_journal_update_sb_errno() - Update error in the journal.
> > > @@ -1892,6 +1977,7 @@ int jbd2_journal_load(journal_t *journal)
> > > {
> > > int err;
> > > journal_superblock_t *sb;
> > > + struct request_queue *q = bdev_get_queue(journal->j_dev);
> > >
> > > err = load_superblock(journal);
> > > if (err)
> > > @@ -1936,6 +2022,12 @@ int jbd2_journal_load(journal_t *journal)
> > > */
> > > journal->j_flags &= ~JBD2_ABORT;
> > >
> > > + /* if journal device supports discard, discard journal blocks */
> > > + if (q && blk_queue_discard(q)) {
> > > + if (__jbd2_journal_issue_discard(journal))
> > > + printk(KERN_ERR "JBD2: failed to discard journal when loading");
> > > + }
> > > +
> > > /* OK, we've finished with the dynamic journal bits:
> > > * reinitialise the dynamic contents of the superblock in memory
> > > * and reset them on disk. */
> > > @@ -2244,15 +2336,18 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
> > > EXPORT_SYMBOL(jbd2_journal_clear_features);
> > >
> > > /**
> > > - * jbd2_journal_flush() - Flush journal
> > > + * __jbd2_journal_flush() - Flush journal
> > > * @journal: Journal to act on.
> > > + * @discard: flag (see below)
> > > *
> > > * Flush all data for a given journal to disk and empty the journal.
> > > * Filesystems can use this when remounting readonly to ensure that
> > > * recovery does not need to happen on remount.
> > > + *
> > > + * If 'discard' is false, the journal is simply flushed. If discard is true,
> > > + * the journal is also discarded.
> > > */
> > > -
> > > -int jbd2_journal_flush(journal_t *journal)
> > > +static int __jbd2_journal_flush(journal_t *journal, bool discard)
> > > {
> > > int err = 0;
> > > transaction_t *transaction = NULL;
> > > @@ -2306,6 +2401,10 @@ int jbd2_journal_flush(journal_t *journal)
> > > * commits of data to the journal will restore the current
> > > * s_start value. */
> > > jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
> > > +
> > > + if (discard)
> > > + err = __jbd2_journal_issue_discard(journal);
> > > +
> > > mutex_unlock(&journal->j_checkpoint_mutex);
> > > write_lock(&journal->j_state_lock);
> > > J_ASSERT(!journal->j_running_transaction);
> > > @@ -2318,6 +2417,17 @@ int jbd2_journal_flush(journal_t *journal)
> > > return err;
> > > }
> > >
> > > +int jbd2_journal_flush(journal_t *journal)
> > > +{
> > > + return __jbd2_journal_flush(journal, false /* don't discard */);
> > > +}
> > > +
> > > +/* flush journal and discard journal log */
> > > +int jbd2_journal_flush_and_discard(journal_t *journal)
> > > +{
> > > + return __jbd2_journal_flush(journal, true /* also discard */);
> > > +}
> > > +
> > > /**
> > > * jbd2_journal_wipe() - Wipe journal contents
> > > * @journal: Journal to act on.
> > > diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
> > > index 99d3cd051ac3..9bed34e9a273 100644
> > > --- a/include/linux/jbd2.h
> > > +++ b/include/linux/jbd2.h
> > > @@ -1492,6 +1492,7 @@ extern int jbd2_journal_invalidatepage(journal_t *,
> > > extern int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page);
> > > extern int jbd2_journal_stop(handle_t *);
> > > extern int jbd2_journal_flush (journal_t *);
> > > +extern int jbd2_journal_flush_and_discard(journal_t *journal);
> > > extern void jbd2_journal_lock_updates (journal_t *);
> > > extern void jbd2_journal_unlock_updates (journal_t *);
> > >
> > > --
> > > 2.31.0.291.g576ba9dcdaf-goog
> > >
>
> Thanks for the comments :)
On Mon, Mar 29, 2021 at 01:05:55PM -0600, Andreas Dilger wrote:
> >> I bet it wasn't done by design -- afaict all the recovery tools are
> >> totally opportunistic in that /if/ they find something that looks like a
> >> directory entry, /then/ it will pick that up. The names will eventually
> >> get overwritten, so that's the best they can do.
> >>
> >> (I would also wager that people don't like opt-out for new behaviors
> >> unless you're adding it as part of a new feature...)
It certainly wasn't deliberate, and I'll note that we've already
compromised recovery tools because of how deleted inodes were changed
when we added journalling to ext3. Previously, we just zeroed
i_links_count and then updated all of the block allocation bitmaps and
block group descriptors' free block counts.
But there's the possibility, especially for very big files, that all
of the blocks that might need to be touched for a truncate might not
fit in a single transaction. So we now handle the final iput of a
deleted files by doing a journalled truncate, which means that all of
the logical to physical mapping gets wiped after the delete is
commited. You can see this if you execute the following series of
commands:
% mke2fs -F -q -t ext2 /tmp/foo.img 1G
% sudo mount -o loop -t ext4 /tmp/foo.img /mnt
% sudo cp /etc/motd /mnt/motd
% sync
% debugfs /tmp/foo.img -R 'stat <12>'
% sudo rm /mnt/motd
% sudo umount /mnt
% debugfs /tmp/foo.img -R 'stat <12>'
... and compare the debugfs output before and after the file is
deleted. Then try the same thing mounting with ext2 instead of ext4,
with a kernel that has ext2 compiled in (as opposed to using ext4 in
compatibility mode).
This has been true ever since ext3 was released in 2001, and people
have largely not noticed or complained. It would be possible to do
better, if we wanted to better support the "Root Oops"[1] recovery
case; we could do a two-pass scan over the file, determine how many
block groups would need to be updated, and then try to open a handle
with the requisite number of credits. If that succeeds, then we can
skip zapping the extent tree or indirect blocks. Otherwise, we fall
back to the current truncate code path. But it would slow down our
performance of unlinks, and over the last two decades, as far as I
know, no one has complained.
I was the person who suggested using a mount option, but on
reflection, given that we *already* pretty much make it impossible to
recover the contents of a deleted file, do we really care about
whether the file name can be recovered? Hence, I'm beginning to think
that perhaps we shouldn't make it be a tunable at all. After all,
zeroing the directory entry is not going to cause any kind of
performance penalty, and if we add a mount option, it's one more thing
mount option we need to support forever.
If we really must make it be a tunable, a lighter weight to do this
would be to assign a new EXT2_FLAGS_xxx bit in es->s_flags, and allow
tune2fs to be able to adjust the field. But I think a strong case
could be made that even that is overkill.
Cheers,
- Ted
[1] A friend of mine back in my undergraduate days once took a scanned
image of a Kellogs Froot Loops cereal box, and did some creative image
editing to make it read "root oops", and replaced the text "Sweetened
Multigrain Cereal" with "rm is forever". I should really ask her if
she still has a copy of the image. :-)
On Mon, Mar 29, 2021 at 03:10:34PM -0700, harshad shirwadkar wrote:
> On Mon, Mar 29, 2021 at 8:09 AM Leah Rumancik <[email protected]> wrote:
> >
> > On Thu, Mar 25, 2021 at 06:21:46PM -0700, Darrick J. Wong wrote:
> > > On Thu, Mar 25, 2021 at 06:12:20PM +0000, Leah Rumancik wrote:
> > > > ioctl EXT4_FLUSH_JOURNAL flushes the journal log with the option to
> > > > discard journal log blocks. With the filename wipeout patch, if the discard
> > > > mount option is set, Ext4 guarantees that all data will be discarded on
> > > > deletion. This ioctl allows for periodically discarding journal contents
> > > > too.
> > >
> > > Hrm. What is the use case here? I guess the goal is to sanitize the
> > > ondisk log contents (even as wiping deleted filenames becomes default)
> > > every now and then? Why do we want cleaning up the log to be an
> > > explicitly separate step that userspace has to invoke?
> > >
> > > (As opposed, say, to discarding the log automatically after every
> > > journal checkpoint if a journal/mount option is set?)
> >
> > The goal here is to be able to ensure everything is sanitized at a
> > particular point in time. If done automatically through the checkpoint,
> > there is no guarantee as to how often / when the sanitizing is
> > performed.
Oh, ok, you want a sanitization barrier ("if this call returns zero,
everything sensitive in the journal is no longer accessible") for
$program. Got it.
It occurred to me overnight that another way to look at this ioctl
proposal is that it checkpoints the filesystem and has a flag to discard
the journal blocks too. Given that we're now only two days away from
my traditional bootfs[1] drum-banging day, and there's real user
demand[2] for bootloaders to be able to force a journal checkpoint,
I think I'll re-read this patch in that context.
[1] https://lore.kernel.org/linux-fsdevel/20190401070001.GJ1173@magnolia/
[2] https://lore.kernel.org/linux-xfs/[email protected]/
> To elaborate more on that, this allows userspace to control the
> frequency of journal discards and this is useful if the userspace
> application is trying to put a bound on content erasing time. With
> Leah's earlier patch in the series, Ext4 when mounted with the
> "discard" mount option would ensure that whenever a file is deleted,
> it disappears completely from the fliesystem controlled LBAs of the
> underlying block device, but no such guarantee is provided for the
> potential remains of the file in the journal. With this
> user-controlled discard frequency of the journal, userspace can ensure
> that the contents of a deleted file disappear entirely within X hours
> by calling this ioctl once every X hours. This allows us to greatly
> simplify journal discards - that's because now we don't have to track
> the journal blocks belonging to a particular file.
>
> Also, sending discards to the entire journal with every commit /
> checkpoint may have some performance issues.
(Heh.)
> So, if we want this to be
> automated, one option would be to invoke this after every N commits
> performed by the journal thread and make N configurable as a mount
> time parameter. But, it's just simpler to call this ioctl from the
> application and let it worry about the frequency, and perhaps these
> applications would anyway be calling fstrim periodically to ensure
> such guarantees. If they are doing that, this would just add another
> step in their periodic discard routine.
Why not make discarding the journal part of FITRIM then?
Or are there really two separate usescases here -- automated background
trimming of everything; and certain programs that want/need to make sure
that all the data stored by that program are no longer accessible via
the LBA interface, and do not want to pay the cost of cleaning the
/entire/ filesystem? (Which I think you could easily do with a third
patch to force discard-on-free behavior on files with SECRM_FL set.)
--D
> - Harshad
>
>
>
>
>
>
>
> - Harshad
> >
> > >
> > > > Also, add journal discard (if discard supported) during journal load
> > > > after recovery. This provides a potential solution to
> > > > https://lore.kernel.org/linux-ext4/[email protected]/ for
> > > > disks that support discard. After a successful journal recovery, e2fsck can
> > > > call this ioctl to discard the journal as well.
> > > >
> > > > Signed-off-by: Leah Rumancik <[email protected]>
> > > > ---
> > > > fs/ext4/ext4.h | 1 +
> > > > fs/ext4/ioctl.c | 28 +++++++++++
> > > > fs/jbd2/journal.c | 116 +++++++++++++++++++++++++++++++++++++++++--
> > > > include/linux/jbd2.h | 1 +
> > > > 4 files changed, 143 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> > > > index 8011418176bc..92c039ebcba7 100644
> > > > --- a/fs/ext4/ext4.h
> > > > +++ b/fs/ext4/ext4.h
> > > > @@ -724,6 +724,7 @@ enum {
> > > > #define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40)
> > > > #define EXT4_IOC_GETSTATE _IOW('f', 41, __u32)
> > > > #define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap)
> > > > +#define EXT4_FLUSH_JOURNAL _IOW('f', 43, int)
> > > >
> > > > #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
> > > >
> > > > diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
> > > > index a2cf35066f46..1d3636c1de3b 100644
> > > > --- a/fs/ext4/ioctl.c
> > > > +++ b/fs/ext4/ioctl.c
> > > > @@ -1318,6 +1318,33 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> > > > return -EOPNOTSUPP;
> > > > return fsverity_ioctl_read_metadata(filp,
> > > > (const void __user *)arg);
> > > > + case EXT4_FLUSH_JOURNAL:
> > > > + {
> > > > + int discard = 0, err = 0;
> > > > +
> > > > + /* file argument is not the mount point */
> > > > + if (file_dentry(filp) != sb->s_root)
> > > > + return -EINVAL;
> > > > +
> > > > + /* filesystem is not backed by block device */
> > > > + if (sb->s_bdev == NULL)
> > > > + return -EINVAL;
> > > > +
> > > > + if (copy_from_user(&discard, (int __user *)arg, sizeof(int)))
> > > > + return -EFAULT;
> > > > +
> > > > + if (EXT4_SB(sb)->s_journal) {
> > > > + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
> > > > +
> > > > + if (discard)
> > > > + err = jbd2_journal_flush_and_discard(EXT4_SB(sb)->s_journal);
> > > > + else
> > > > + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
> > >
> > > Why not pass this as a flag?
> > >
> > > --D
> > >
> >
> > Yes sure, that would make it simpler. I will update it.
> >
> > > > +
> > > > + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
> > > > + }
> > > > + return err;
> > > > + }
> > > >
> > > > default:
> > > > return -ENOTTY;
> > > > @@ -1407,6 +1434,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> > > > case EXT4_IOC_GET_ES_CACHE:
> > > > case FS_IOC_FSGETXATTR:
> > > > case FS_IOC_FSSETXATTR:
> > > > + case EXT4_FLUSH_JOURNAL:
> > > > break;
> > > > default:
> > > > return -ENOIOCTLCMD;
> > > > diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> > > > index 2dc944442802..9718512e7178 100644
> > > > --- a/fs/jbd2/journal.c
> > > > +++ b/fs/jbd2/journal.c
> > > > @@ -67,6 +67,7 @@ EXPORT_SYMBOL(jbd2_journal_set_triggers);
> > > > EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
> > > > EXPORT_SYMBOL(jbd2_journal_forget);
> > > > EXPORT_SYMBOL(jbd2_journal_flush);
> > > > +EXPORT_SYMBOL(jbd2_journal_flush_and_discard);
> > > > EXPORT_SYMBOL(jbd2_journal_revoke);
> > > >
> > > > EXPORT_SYMBOL(jbd2_journal_init_dev);
> > > > @@ -1686,6 +1687,90 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
> > > > write_unlock(&journal->j_state_lock);
> > > > }
> > > >
> > > > +/* discard journal blocks excluding journal superblock */
> > > > +static int __jbd2_journal_issue_discard(journal_t *journal)
> > > > +{
> > > > + int err = 0;
> > > > + unsigned long block, log_offset;
> > > > + unsigned long long phys_block, block_start, block_stop;
> > > > + loff_t byte_start, byte_stop, byte_count;
> > > > + struct request_queue *q = bdev_get_queue(journal->j_dev);
> > > > +
> > > > + if (!q)
> > > > + return -ENXIO;
> > > > +
> > > > + if (!blk_queue_discard(q))
> > > > + return -EOPNOTSUPP;
> > > > +
> > > > + /* lookup block mapping and issue discard for each contiguous region */
> > > > + log_offset = be32_to_cpu(journal->j_superblock->s_first);
> > > > +
> > > > + err = jbd2_journal_bmap(journal, log_offset, &block_start);
> > > > + if (err) {
> > > > + printk(KERN_ERR "JBD2: bad block at offset %lu", log_offset);
> > > > + return err;
> > > > + }
> > > > +
> > > > + /*
> > > > + * use block_start - 1 to meet check for contiguous with previous region:
> > > > + * phys_block == block_stop + 1
> > > > + */
> > > > + block_stop = block_start - 1;
> > > > +
> > > > + for (block = log_offset; block < journal->j_total_len; block++) {
> > > > + err = jbd2_journal_bmap(journal, block, &phys_block);
> > > > + if (err) {
> > > > + printk(KERN_ERR "JBD2: bad block at offset %lu", block);
> > > > + return err;
> > > > + }
> > > > +
> > > > + /*
> > > > + * if block is last block, update stopping point
> > > > + * if not last block and
> > > > + * block is contiguous with previous block, continue
> > > > + */
> > > > + if (block == journal->j_total_len - 1)
> > > > + block_stop = phys_block;
> > > > + else if (phys_block == block_stop + 1) {
> > > > + block_stop++;
> > > > + continue;
> > > > + }
> > > > +
> > > > + /*
> > > > + * if not contiguous with prior physical block or this is last
> > > > + * block of journal, take care of the region
> > > > + */
> > > > + byte_start = block_start * journal->j_blocksize;
> > > > + byte_stop = block_stop * journal->j_blocksize;
> > > > + byte_count = (block_stop - block_start + 1) *
> > > > + journal->j_blocksize;
> > > > +
> > > > + truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping,
> > > > + byte_start, byte_stop);
> > > > +
> > > > + /*
> > > > + * use blkdev_issue_discard instead of sb_issue_discard
> > > > + * because superblock not yet populated when this is
> > > > + * called during journal_load during mount process
> > > > + */
> > > > + err = blkdev_issue_discard(journal->j_dev,
> > > > + byte_start >> SECTOR_SHIFT,
> > > > + byte_count >> SECTOR_SHIFT,
> > > > + GFP_NOFS, 0);
> > > > +
> > > > + if (unlikely(err != 0)) {
> > > > + printk(KERN_ERR "JBD2: unable to discard "
> > > > + "journal at physical blocks %llu - %llu",
> > > > + block_start, block_stop);
> > > > + return err;
> > > > + }
> > > > +
> > > > + block_start = phys_block;
> > > > + block_stop = phys_block;
> > > > + }
> > > > +
> > > > + return blkdev_issue_flush(journal->j_dev);
> > > > +}
> > > >
> > > > /**
> > > > * jbd2_journal_update_sb_errno() - Update error in the journal.
> > > > @@ -1892,6 +1977,7 @@ int jbd2_journal_load(journal_t *journal)
> > > > {
> > > > int err;
> > > > journal_superblock_t *sb;
> > > > + struct request_queue *q = bdev_get_queue(journal->j_dev);
> > > >
> > > > err = load_superblock(journal);
> > > > if (err)
> > > > @@ -1936,6 +2022,12 @@ int jbd2_journal_load(journal_t *journal)
> > > > */
> > > > journal->j_flags &= ~JBD2_ABORT;
> > > >
> > > > + /* if journal device supports discard, discard journal blocks */
> > > > + if (q && blk_queue_discard(q)) {
> > > > + if (__jbd2_journal_issue_discard(journal))
> > > > + printk(KERN_ERR "JBD2: failed to discard journal when loading");
> > > > + }
> > > > +
> > > > /* OK, we've finished with the dynamic journal bits:
> > > > * reinitialise the dynamic contents of the superblock in memory
> > > > * and reset them on disk. */
> > > > @@ -2244,15 +2336,18 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
> > > > EXPORT_SYMBOL(jbd2_journal_clear_features);
> > > >
> > > > /**
> > > > - * jbd2_journal_flush() - Flush journal
> > > > + * __jbd2_journal_flush() - Flush journal
> > > > * @journal: Journal to act on.
> > > > + * @discard: flag (see below)
> > > > *
> > > > * Flush all data for a given journal to disk and empty the journal.
> > > > * Filesystems can use this when remounting readonly to ensure that
> > > > * recovery does not need to happen on remount.
> > > > + *
> > > > + * If 'discard' is false, the journal is simply flushed. If discard is true,
> > > > + * the journal is also discarded.
> > > > */
> > > > -
> > > > -int jbd2_journal_flush(journal_t *journal)
> > > > +static int __jbd2_journal_flush(journal_t *journal, bool discard)
> > > > {
> > > > int err = 0;
> > > > transaction_t *transaction = NULL;
> > > > @@ -2306,6 +2401,10 @@ int jbd2_journal_flush(journal_t *journal)
> > > > * commits of data to the journal will restore the current
> > > > * s_start value. */
> > > > jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
> > > > +
> > > > + if (discard)
> > > > + err = __jbd2_journal_issue_discard(journal);
> > > > +
> > > > mutex_unlock(&journal->j_checkpoint_mutex);
> > > > write_lock(&journal->j_state_lock);
> > > > J_ASSERT(!journal->j_running_transaction);
> > > > @@ -2318,6 +2417,17 @@ int jbd2_journal_flush(journal_t *journal)
> > > > return err;
> > > > }
> > > >
> > > > +int jbd2_journal_flush(journal_t *journal)
> > > > +{
> > > > + return __jbd2_journal_flush(journal, false /* don't discard */);
> > > > +}
> > > > +
> > > > +/* flush journal and discard journal log */
> > > > +int jbd2_journal_flush_and_discard(journal_t *journal)
> > > > +{
> > > > + return __jbd2_journal_flush(journal, true /* also discard */);
> > > > +}
> > > > +
> > > > /**
> > > > * jbd2_journal_wipe() - Wipe journal contents
> > > > * @journal: Journal to act on.
> > > > diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
> > > > index 99d3cd051ac3..9bed34e9a273 100644
> > > > --- a/include/linux/jbd2.h
> > > > +++ b/include/linux/jbd2.h
> > > > @@ -1492,6 +1492,7 @@ extern int jbd2_journal_invalidatepage(journal_t *,
> > > > extern int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page);
> > > > extern int jbd2_journal_stop(handle_t *);
> > > > extern int jbd2_journal_flush (journal_t *);
> > > > +extern int jbd2_journal_flush_and_discard(journal_t *journal);
> > > > extern void jbd2_journal_lock_updates (journal_t *);
> > > > extern void jbd2_journal_unlock_updates (journal_t *);
> > > >
> > > > --
> > > > 2.31.0.291.g576ba9dcdaf-goog
> > > >
> >
> > Thanks for the comments :)
On Tue, Mar 30, 2021 at 09:32:23AM -0700, Darrick J. Wong wrote:
> Why not make discarding the journal part of FITRIM then?
Unfortunately, the fstrim_range structure doesn't have a place for a
flags field, and FITRIM works by specifying a range of LBA's:
struct fstrim_range {
__u64 start;
__u64 len;
__u64 minlen;
};
I suppose we could do something where some combination of start/len
means "also checkpoint and discard the journal", but that seems rather
kludgy.
> It occurred to me overnight that another way to look at this ioctl
> proposal is that it checkpoints the filesystem and has a flag to discard
> the journal blocks too. Given that we're now only two days away from
> my traditional bootfs[1] drum-banging day, and there's real user
> demand[2] for bootloaders to be able to force a journal checkpoint,
How about if we have an ioctl which is "checkpoint journal", which can
be file system independent (e.g., defined in include/uapi/linux/fs.h)
which takes a u32 flags field, where we define a flag bit to mean
"also discard the unused part of the journal after the checkpoint"?
It seems that would also solve your bootfs() use case.
- Ted
On Tue, Mar 30, 2021 at 01:15:08PM -0400, Theodore Ts'o wrote:
> On Tue, Mar 30, 2021 at 09:32:23AM -0700, Darrick J. Wong wrote:
> > Why not make discarding the journal part of FITRIM then?
>
> Unfortunately, the fstrim_range structure doesn't have a place for a
> flags field, and FITRIM works by specifying a range of LBA's:
>
> struct fstrim_range {
> __u64 start;
> __u64 len;
> __u64 minlen;
> };
>
> I suppose we could do something where some combination of start/len
> means "also checkpoint and discard the journal", but that seems rather
> kludgy.
>
> > It occurred to me overnight that another way to look at this ioctl
> > proposal is that it checkpoints the filesystem and has a flag to discard
> > the journal blocks too. Given that we're now only two days away from
> > my traditional bootfs[1] drum-banging day, and there's real user
> > demand[2] for bootloaders to be able to force a journal checkpoint,
>
> How about if we have an ioctl which is "checkpoint journal", which can
> be file system independent (e.g., defined in include/uapi/linux/fs.h)
> which takes a u32 flags field, where we define a flag bit to mean
> "also discard the unused part of the journal after the checkpoint"?
>
> It seems that would also solve your bootfs() use case.
Yeah, that's where I was going with this. I just sent a new review for
the other patch with that level of focus. :)
--D
> - Ted
On Thu, Mar 25, 2021 at 06:12:20PM +0000, Leah Rumancik wrote:
> ioctl EXT4_FLUSH_JOURNAL flushes the journal log with the option to
> discard journal log blocks. With the filename wipeout patch, if the discard
> mount option is set, Ext4 guarantees that all data will be discarded on
> deletion. This ioctl allows for periodically discarding journal contents
> too.
>
> Also, add journal discard (if discard supported) during journal load
> after recovery. This provides a potential solution to
> https://lore.kernel.org/linux-ext4/[email protected]/ for
> disks that support discard. After a successful journal recovery, e2fsck can
> call this ioctl to discard the journal as well.
Ok, round 2, this time from the perspective of a adding a journal
checkpointing ioctl so that we can finally fix grub stupidity.
> Signed-off-by: Leah Rumancik <[email protected]>
> ---
> fs/ext4/ext4.h | 1 +
> fs/ext4/ioctl.c | 28 +++++++++++
> fs/jbd2/journal.c | 116 +++++++++++++++++++++++++++++++++++++++++--
> include/linux/jbd2.h | 1 +
> 4 files changed, 143 insertions(+), 3 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 8011418176bc..92c039ebcba7 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -724,6 +724,7 @@ enum {
> #define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40)
> #define EXT4_IOC_GETSTATE _IOW('f', 41, __u32)
> #define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap)
> +#define EXT4_FLUSH_JOURNAL _IOW('f', 43, int)
This really ought to be named "CHECKPOINT", not "FLUSH", because
flushing only requires persisting to stable storage somewhere. This
call does a lot more work than that, so its name ought to reflect the
fact that it checkpoints the filesystem to clean the journal and then
trims the journal blocks.
The grub bootloader has had a serious design flaw ever since it
introduced the ext4 and xfs drivers -- it ignores the journal when it's
reading a filesystem, which means that unrecovered transactions in the
journal are ignored. We (XFS anyway) /really/ don't want grub's
diminutive filesystem drivers trying to implement recovery.
Because of this inadequacy, we get sporadic complaints about grub
failing to recognize new kernel files if the system goes down
immediately after the package manager installs a new kernel, even if it
succeeds in syncfs()ing /boot afterwards. The cause, of course, is that
we /did/ flush the directory updates to disk, but they're in the journal
and the journal didn't checkpoint before the system went down.
XFS is far worse off in this category because we only tend to checkpoint
the log when the head approaches the tail; iirc ext4/jbd2 tend to
checkpoint frequently enough that I get fewer bug reports about it.
The solution, I think, is to add a checkpoint call so that grub can
operate with greater confidence that the bootloader stage2 will be able
to find the files it just wrote to the filesystem. Previous iterations
on this complaint suggested FIFREEZE/FITHAW, which was proven not to
work because we cannot guarantee the ability to stop the world for a
freeze, and grub only requires that the effects of previous system calls
can be found with a norecovery mount.
IOWS: I really like this new checkpointing ioctl! If we can wire this
up in the five major /boot filesystems (ext*, XFS, btrfs, and vfat) then
we can finally tell the grub developers to we have a real solution for
them. :)
> #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
>
> diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
> index a2cf35066f46..1d3636c1de3b 100644
> --- a/fs/ext4/ioctl.c
> +++ b/fs/ext4/ioctl.c
> @@ -1318,6 +1318,33 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> return -EOPNOTSUPP;
> return fsverity_ioctl_read_metadata(filp,
> (const void __user *)arg);
> + case EXT4_FLUSH_JOURNAL:
> + {
> + int discard = 0, err = 0;
> +
> + /* file argument is not the mount point */
> + if (file_dentry(filp) != sb->s_root)
> + return -EINVAL;
> +
> + /* filesystem is not backed by block device */
> + if (sb->s_bdev == NULL)
> + return -EINVAL;o
Could it be a problem that unprivileged programs can pound on a
heavyweight ioctl? The other callers of jbd2_journal_flush imply pretty
heavily that checkpointing is expensive and that we don't really expect
users to be able to induce a checkpoint.
> +
> + if (copy_from_user(&discard, (int __user *)arg, sizeof(int)))
> + return -EFAULT;
Please use an explicit struct with extra padding for future expansion,
because the history of ext4 ioctls taking integer pointer arguments is a
mess[1].
struct ioc_checkpoint_journal {
u64 flags;
u64 pad[3]; /* must be zero */
};
The GETFLAGS and SETFLAGS ioctls are defined to take a pointer to a
signed long, but the implementations use an unsigned int. Nobody
noticed the potential for memory corruption in the calling processes
until well after we moved to 64-bit. Hopefully we can avoid a repeat of
that by using explicitly sized types and named structs that are a little
more obvious to readers.
[1] https://lore.kernel.org/linux-fsdevel/[email protected]/
> +
> + if (EXT4_SB(sb)->s_journal) {
> + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
> +
> + if (discard)
There's only a single bit of information here, so please only use one
bit, and leave the other 63/31 bits for future expansion.
Also, um, is there a manpage to document this new call? Or an fstest
to check its operation? I would very much like to port this to XFS, but
we need artifacts and the ability to show that it works.
--D
> + err = jbd2_journal_flush_and_discard(EXT4_SB(sb)->s_journal);
> + else
> + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
> +
> + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
> + }
> + return err;
> + }
>
> default:
> return -ENOTTY;
> @@ -1407,6 +1434,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> case EXT4_IOC_GET_ES_CACHE:
> case FS_IOC_FSGETXATTR:
> case FS_IOC_FSSETXATTR:
> + case EXT4_FLUSH_JOURNAL:
> break;
> default:
> return -ENOIOCTLCMD;
> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> index 2dc944442802..9718512e7178 100644
> --- a/fs/jbd2/journal.c
> +++ b/fs/jbd2/journal.c
> @@ -67,6 +67,7 @@ EXPORT_SYMBOL(jbd2_journal_set_triggers);
> EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
> EXPORT_SYMBOL(jbd2_journal_forget);
> EXPORT_SYMBOL(jbd2_journal_flush);
> +EXPORT_SYMBOL(jbd2_journal_flush_and_discard);
> EXPORT_SYMBOL(jbd2_journal_revoke);
>
> EXPORT_SYMBOL(jbd2_journal_init_dev);
> @@ -1686,6 +1687,90 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
> write_unlock(&journal->j_state_lock);
> }
>
> +/* discard journal blocks excluding journal superblock */
> +static int __jbd2_journal_issue_discard(journal_t *journal)
> +{
> + int err = 0;
> + unsigned long block, log_offset;
> + unsigned long long phys_block, block_start, block_stop;
> + loff_t byte_start, byte_stop, byte_count;
> + struct request_queue *q = bdev_get_queue(journal->j_dev);
> +
> + if (!q)
> + return -ENXIO;
> +
> + if (!blk_queue_discard(q))
> + return -EOPNOTSUPP;
> +
> + /* lookup block mapping and issue discard for each contiguous region */
> + log_offset = be32_to_cpu(journal->j_superblock->s_first);
> +
> + err = jbd2_journal_bmap(journal, log_offset, &block_start);
> + if (err) {
> + printk(KERN_ERR "JBD2: bad block at offset %lu", log_offset);
> + return err;
> + }
> +
> + /*
> + * use block_start - 1 to meet check for contiguous with previous region:
> + * phys_block == block_stop + 1
> + */
> + block_stop = block_start - 1;
> +
> + for (block = log_offset; block < journal->j_total_len; block++) {
> + err = jbd2_journal_bmap(journal, block, &phys_block);
> + if (err) {
> + printk(KERN_ERR "JBD2: bad block at offset %lu", block);
> + return err;
> + }
> +
> + /*
> + * if block is last block, update stopping point
> + * if not last block and
> + * block is contiguous with previous block, continue
> + */
> + if (block == journal->j_total_len - 1)
> + block_stop = phys_block;
> + else if (phys_block == block_stop + 1) {
> + block_stop++;
> + continue;
> + }
> +
> + /*
> + * if not contiguous with prior physical block or this is last
> + * block of journal, take care of the region
> + */
> + byte_start = block_start * journal->j_blocksize;
> + byte_stop = block_stop * journal->j_blocksize;
> + byte_count = (block_stop - block_start + 1) *
> + journal->j_blocksize;
> +
> + truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping,
> + byte_start, byte_stop);
> +
> + /*
> + * use blkdev_issue_discard instead of sb_issue_discard
> + * because superblock not yet populated when this is
> + * called during journal_load during mount process
> + */
> + err = blkdev_issue_discard(journal->j_dev,
> + byte_start >> SECTOR_SHIFT,
> + byte_count >> SECTOR_SHIFT,
> + GFP_NOFS, 0);
> +
> + if (unlikely(err != 0)) {
> + printk(KERN_ERR "JBD2: unable to discard "
> + "journal at physical blocks %llu - %llu",
> + block_start, block_stop);
> + return err;
> + }
> +
> + block_start = phys_block;
> + block_stop = phys_block;
> + }
> +
> + return blkdev_issue_flush(journal->j_dev);
> +}
>
> /**
> * jbd2_journal_update_sb_errno() - Update error in the journal.
> @@ -1892,6 +1977,7 @@ int jbd2_journal_load(journal_t *journal)
> {
> int err;
> journal_superblock_t *sb;
> + struct request_queue *q = bdev_get_queue(journal->j_dev);
>
> err = load_superblock(journal);
> if (err)
> @@ -1936,6 +2022,12 @@ int jbd2_journal_load(journal_t *journal)
> */
> journal->j_flags &= ~JBD2_ABORT;
>
> + /* if journal device supports discard, discard journal blocks */
> + if (q && blk_queue_discard(q)) {
> + if (__jbd2_journal_issue_discard(journal))
> + printk(KERN_ERR "JBD2: failed to discard journal when loading");
> + }
> +
> /* OK, we've finished with the dynamic journal bits:
> * reinitialise the dynamic contents of the superblock in memory
> * and reset them on disk. */
> @@ -2244,15 +2336,18 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
> EXPORT_SYMBOL(jbd2_journal_clear_features);
>
> /**
> - * jbd2_journal_flush() - Flush journal
> + * __jbd2_journal_flush() - Flush journal
> * @journal: Journal to act on.
> + * @discard: flag (see below)
> *
> * Flush all data for a given journal to disk and empty the journal.
> * Filesystems can use this when remounting readonly to ensure that
> * recovery does not need to happen on remount.
> + *
> + * If 'discard' is false, the journal is simply flushed. If discard is true,
> + * the journal is also discarded.
> */
> -
> -int jbd2_journal_flush(journal_t *journal)
> +static int __jbd2_journal_flush(journal_t *journal, bool discard)
> {
> int err = 0;
> transaction_t *transaction = NULL;
> @@ -2306,6 +2401,10 @@ int jbd2_journal_flush(journal_t *journal)
> * commits of data to the journal will restore the current
> * s_start value. */
> jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
> +
> + if (discard)
> + err = __jbd2_journal_issue_discard(journal);
> +
> mutex_unlock(&journal->j_checkpoint_mutex);
> write_lock(&journal->j_state_lock);
> J_ASSERT(!journal->j_running_transaction);
> @@ -2318,6 +2417,17 @@ int jbd2_journal_flush(journal_t *journal)
> return err;
> }
>
> +int jbd2_journal_flush(journal_t *journal)
> +{
> + return __jbd2_journal_flush(journal, false /* don't discard */);
> +}
> +
> +/* flush journal and discard journal log */
> +int jbd2_journal_flush_and_discard(journal_t *journal)
> +{
> + return __jbd2_journal_flush(journal, true /* also discard */);
> +}
> +
> /**
> * jbd2_journal_wipe() - Wipe journal contents
> * @journal: Journal to act on.
> diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
> index 99d3cd051ac3..9bed34e9a273 100644
> --- a/include/linux/jbd2.h
> +++ b/include/linux/jbd2.h
> @@ -1492,6 +1492,7 @@ extern int jbd2_journal_invalidatepage(journal_t *,
> extern int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page);
> extern int jbd2_journal_stop(handle_t *);
> extern int jbd2_journal_flush (journal_t *);
> +extern int jbd2_journal_flush_and_discard(journal_t *journal);
> extern void jbd2_journal_lock_updates (journal_t *);
> extern void jbd2_journal_unlock_updates (journal_t *);
>
> --
> 2.31.0.291.g576ba9dcdaf-goog
>
On Tue, Mar 30, 2021 at 10:17:34AM -0700, Darrick J. Wong wrote:
> On Thu, Mar 25, 2021 at 06:12:20PM +0000, Leah Rumancik wrote:
> > ioctl EXT4_FLUSH_JOURNAL flushes the journal log with the option to
> > discard journal log blocks. With the filename wipeout patch, if the discard
> > mount option is set, Ext4 guarantees that all data will be discarded on
> > deletion. This ioctl allows for periodically discarding journal contents
> > too.
> >
> > Also, add journal discard (if discard supported) during journal load
> > after recovery. This provides a potential solution to
> > https://lore.kernel.org/linux-ext4/[email protected]/ for
> > disks that support discard. After a successful journal recovery, e2fsck can
> > call this ioctl to discard the journal as well.
>
> Ok, round 2, this time from the perspective of a adding a journal
> checkpointing ioctl so that we can finally fix grub stupidity.
>
> > Signed-off-by: Leah Rumancik <[email protected]>
> > ---
> > fs/ext4/ext4.h | 1 +
> > fs/ext4/ioctl.c | 28 +++++++++++
> > fs/jbd2/journal.c | 116 +++++++++++++++++++++++++++++++++++++++++--
> > include/linux/jbd2.h | 1 +
> > 4 files changed, 143 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> > index 8011418176bc..92c039ebcba7 100644
> > --- a/fs/ext4/ext4.h
> > +++ b/fs/ext4/ext4.h
> > @@ -724,6 +724,7 @@ enum {
> > #define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40)
> > #define EXT4_IOC_GETSTATE _IOW('f', 41, __u32)
> > #define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap)
> > +#define EXT4_FLUSH_JOURNAL _IOW('f', 43, int)
>
> This really ought to be named "CHECKPOINT", not "FLUSH", because
> flushing only requires persisting to stable storage somewhere. This
> call does a lot more work than that, so its name ought to reflect the
> fact that it checkpoints the filesystem to clean the journal and then
> trims the journal blocks.
Yes, sure. I will rename it.
>
> The grub bootloader has had a serious design flaw ever since it
> introduced the ext4 and xfs drivers -- it ignores the journal when it's
> reading a filesystem, which means that unrecovered transactions in the
> journal are ignored. We (XFS anyway) /really/ don't want grub's
> diminutive filesystem drivers trying to implement recovery.
>
> Because of this inadequacy, we get sporadic complaints about grub
> failing to recognize new kernel files if the system goes down
> immediately after the package manager installs a new kernel, even if it
> succeeds in syncfs()ing /boot afterwards. The cause, of course, is that
> we /did/ flush the directory updates to disk, but they're in the journal
> and the journal didn't checkpoint before the system went down.
>
> XFS is far worse off in this category because we only tend to checkpoint
> the log when the head approaches the tail; iirc ext4/jbd2 tend to
> checkpoint frequently enough that I get fewer bug reports about it.
>
> The solution, I think, is to add a checkpoint call so that grub can
> operate with greater confidence that the bootloader stage2 will be able
> to find the files it just wrote to the filesystem. Previous iterations
> on this complaint suggested FIFREEZE/FITHAW, which was proven not to
> work because we cannot guarantee the ability to stop the world for a
> freeze, and grub only requires that the effects of previous system calls
> can be found with a norecovery mount.
>
> IOWS: I really like this new checkpointing ioctl! If we can wire this
> up in the five major /boot filesystems (ext*, XFS, btrfs, and vfat) then
> we can finally tell the grub developers to we have a real solution for
> them. :)
Sounds good to me!
>
> > #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
> >
> > diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
> > index a2cf35066f46..1d3636c1de3b 100644
> > --- a/fs/ext4/ioctl.c
> > +++ b/fs/ext4/ioctl.c
> > @@ -1318,6 +1318,33 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> > return -EOPNOTSUPP;
> > return fsverity_ioctl_read_metadata(filp,
> > (const void __user *)arg);
> > + case EXT4_FLUSH_JOURNAL:
> > + {
> > + int discard = 0, err = 0;
> > +
> > + /* file argument is not the mount point */
> > + if (file_dentry(filp) != sb->s_root)
> > + return -EINVAL;
> > +
> > + /* filesystem is not backed by block device */
> > + if (sb->s_bdev == NULL)
> > + return -EINVAL;o
>
> Could it be a problem that unprivileged programs can pound on a
> heavyweight ioctl? The other callers of jbd2_journal_flush imply pretty
> heavily that checkpointing is expensive and that we don't really expect
> users to be able to induce a checkpoint.
Hmm yeah that's a good point. The original reasoning for not requiring
root privileges was so it could be used even if a filesystem was created
without root privileges. But that's not the common case and considering
the expensivness, I think requiring root privileges makes sense.
>
> > +
> > + if (copy_from_user(&discard, (int __user *)arg, sizeof(int)))
> > + return -EFAULT;
>
> Please use an explicit struct with extra padding for future expansion,
> because the history of ext4 ioctls taking integer pointer arguments is a
> mess[1].
>
> struct ioc_checkpoint_journal {
> u64 flags;
> u64 pad[3]; /* must be zero */
> };
>
> The GETFLAGS and SETFLAGS ioctls are defined to take a pointer to a
> signed long, but the implementations use an unsigned int. Nobody
> noticed the potential for memory corruption in the calling processes
> until well after we moved to 64-bit. Hopefully we can avoid a repeat of
> that by using explicitly sized types and named structs that are a little
> more obvious to readers.
>
> [1] https://lore.kernel.org/linux-fsdevel/[email protected]/
>
> > +
> > + if (EXT4_SB(sb)->s_journal) {
> > + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
> > +
> > + if (discard)
>
> There's only a single bit of information here, so please only use one
> bit, and leave the other 63/31 bits for future expansion.
>
Yes, sure. Thanks for the suggestions/references.
> Also, um, is there a manpage to document this new call? Or an fstest
> to check its operation? I would very much like to port this to XFS, but
> we need artifacts and the ability to show that it works.
There isn't a manpage yet, but I'd be happy to add one. I also plan on
submitting an fstest.
>
> --D
>
> > + err = jbd2_journal_flush_and_discard(EXT4_SB(sb)->s_journal);
> > + else
> > + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
> > +
> > + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
> > + }
> > + return err;
> > + }
> >
> > default:
> > return -ENOTTY;
> > @@ -1407,6 +1434,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> > case EXT4_IOC_GET_ES_CACHE:
> > case FS_IOC_FSGETXATTR:
> > case FS_IOC_FSSETXATTR:
> > + case EXT4_FLUSH_JOURNAL:
> > break;
> > default:
> > return -ENOIOCTLCMD;
> > diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> > index 2dc944442802..9718512e7178 100644
> > --- a/fs/jbd2/journal.c
> > +++ b/fs/jbd2/journal.c
> > @@ -67,6 +67,7 @@ EXPORT_SYMBOL(jbd2_journal_set_triggers);
> > EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
> > EXPORT_SYMBOL(jbd2_journal_forget);
> > EXPORT_SYMBOL(jbd2_journal_flush);
> > +EXPORT_SYMBOL(jbd2_journal_flush_and_discard);
> > EXPORT_SYMBOL(jbd2_journal_revoke);
> >
> > EXPORT_SYMBOL(jbd2_journal_init_dev);
> > @@ -1686,6 +1687,90 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
> > write_unlock(&journal->j_state_lock);
> > }
> >
> > +/* discard journal blocks excluding journal superblock */
> > +static int __jbd2_journal_issue_discard(journal_t *journal)
> > +{
> > + int err = 0;
> > + unsigned long block, log_offset;
> > + unsigned long long phys_block, block_start, block_stop;
> > + loff_t byte_start, byte_stop, byte_count;
> > + struct request_queue *q = bdev_get_queue(journal->j_dev);
> > +
> > + if (!q)
> > + return -ENXIO;
> > +
> > + if (!blk_queue_discard(q))
> > + return -EOPNOTSUPP;
> > +
> > + /* lookup block mapping and issue discard for each contiguous region */
> > + log_offset = be32_to_cpu(journal->j_superblock->s_first);
> > +
> > + err = jbd2_journal_bmap(journal, log_offset, &block_start);
> > + if (err) {
> > + printk(KERN_ERR "JBD2: bad block at offset %lu", log_offset);
> > + return err;
> > + }
> > +
> > + /*
> > + * use block_start - 1 to meet check for contiguous with previous region:
> > + * phys_block == block_stop + 1
> > + */
> > + block_stop = block_start - 1;
> > +
> > + for (block = log_offset; block < journal->j_total_len; block++) {
> > + err = jbd2_journal_bmap(journal, block, &phys_block);
> > + if (err) {
> > + printk(KERN_ERR "JBD2: bad block at offset %lu", block);
> > + return err;
> > + }
> > +
> > + /*
> > + * if block is last block, update stopping point
> > + * if not last block and
> > + * block is contiguous with previous block, continue
> > + */
> > + if (block == journal->j_total_len - 1)
> > + block_stop = phys_block;
> > + else if (phys_block == block_stop + 1) {
> > + block_stop++;
> > + continue;
> > + }
> > +
> > + /*
> > + * if not contiguous with prior physical block or this is last
> > + * block of journal, take care of the region
> > + */
> > + byte_start = block_start * journal->j_blocksize;
> > + byte_stop = block_stop * journal->j_blocksize;
> > + byte_count = (block_stop - block_start + 1) *
> > + journal->j_blocksize;
> > +
> > + truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping,
> > + byte_start, byte_stop);
> > +
> > + /*
> > + * use blkdev_issue_discard instead of sb_issue_discard
> > + * because superblock not yet populated when this is
> > + * called during journal_load during mount process
> > + */
> > + err = blkdev_issue_discard(journal->j_dev,
> > + byte_start >> SECTOR_SHIFT,
> > + byte_count >> SECTOR_SHIFT,
> > + GFP_NOFS, 0);
> > +
> > + if (unlikely(err != 0)) {
> > + printk(KERN_ERR "JBD2: unable to discard "
> > + "journal at physical blocks %llu - %llu",
> > + block_start, block_stop);
> > + return err;
> > + }
> > +
> > + block_start = phys_block;
> > + block_stop = phys_block;
> > + }
> > +
> > + return blkdev_issue_flush(journal->j_dev);
> > +}
> >
> > /**
> > * jbd2_journal_update_sb_errno() - Update error in the journal.
> > @@ -1892,6 +1977,7 @@ int jbd2_journal_load(journal_t *journal)
> > {
> > int err;
> > journal_superblock_t *sb;
> > + struct request_queue *q = bdev_get_queue(journal->j_dev);
> >
> > err = load_superblock(journal);
> > if (err)
> > @@ -1936,6 +2022,12 @@ int jbd2_journal_load(journal_t *journal)
> > */
> > journal->j_flags &= ~JBD2_ABORT;
> >
> > + /* if journal device supports discard, discard journal blocks */
> > + if (q && blk_queue_discard(q)) {
> > + if (__jbd2_journal_issue_discard(journal))
> > + printk(KERN_ERR "JBD2: failed to discard journal when loading");
> > + }
> > +
> > /* OK, we've finished with the dynamic journal bits:
> > * reinitialise the dynamic contents of the superblock in memory
> > * and reset them on disk. */
> > @@ -2244,15 +2336,18 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
> > EXPORT_SYMBOL(jbd2_journal_clear_features);
> >
> > /**
> > - * jbd2_journal_flush() - Flush journal
> > + * __jbd2_journal_flush() - Flush journal
> > * @journal: Journal to act on.
> > + * @discard: flag (see below)
> > *
> > * Flush all data for a given journal to disk and empty the journal.
> > * Filesystems can use this when remounting readonly to ensure that
> > * recovery does not need to happen on remount.
> > + *
> > + * If 'discard' is false, the journal is simply flushed. If discard is true,
> > + * the journal is also discarded.
> > */
> > -
> > -int jbd2_journal_flush(journal_t *journal)
> > +static int __jbd2_journal_flush(journal_t *journal, bool discard)
> > {
> > int err = 0;
> > transaction_t *transaction = NULL;
> > @@ -2306,6 +2401,10 @@ int jbd2_journal_flush(journal_t *journal)
> > * commits of data to the journal will restore the current
> > * s_start value. */
> > jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
> > +
> > + if (discard)
> > + err = __jbd2_journal_issue_discard(journal);
> > +
> > mutex_unlock(&journal->j_checkpoint_mutex);
> > write_lock(&journal->j_state_lock);
> > J_ASSERT(!journal->j_running_transaction);
> > @@ -2318,6 +2417,17 @@ int jbd2_journal_flush(journal_t *journal)
> > return err;
> > }
> >
> > +int jbd2_journal_flush(journal_t *journal)
> > +{
> > + return __jbd2_journal_flush(journal, false /* don't discard */);
> > +}
> > +
> > +/* flush journal and discard journal log */
> > +int jbd2_journal_flush_and_discard(journal_t *journal)
> > +{
> > + return __jbd2_journal_flush(journal, true /* also discard */);
> > +}
> > +
> > /**
> > * jbd2_journal_wipe() - Wipe journal contents
> > * @journal: Journal to act on.
> > diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
> > index 99d3cd051ac3..9bed34e9a273 100644
> > --- a/include/linux/jbd2.h
> > +++ b/include/linux/jbd2.h
> > @@ -1492,6 +1492,7 @@ extern int jbd2_journal_invalidatepage(journal_t *,
> > extern int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page);
> > extern int jbd2_journal_stop(handle_t *);
> > extern int jbd2_journal_flush (journal_t *);
> > +extern int jbd2_journal_flush_and_discard(journal_t *journal);
> > extern void jbd2_journal_lock_updates (journal_t *);
> > extern void jbd2_journal_unlock_updates (journal_t *);
> >
> > --
> > 2.31.0.291.g576ba9dcdaf-goog
> >