Add a flags argument to jbd2_journal_flush to enable discarding or
zero-filling the journal blocks while flushing the journal.
Signed-off-by: Leah Rumancik <[email protected]>
Changes in v4:
- restructured code division between patches
- changed jbd2_journal_flush flags arg from bool to unsigned long long
---
fs/ext4/inode.c | 4 +-
fs/ext4/ioctl.c | 6 +--
fs/ext4/super.c | 6 +--
fs/jbd2/journal.c | 112 ++++++++++++++++++++++++++++++++++++++++++-
fs/ocfs2/alloc.c | 2 +-
fs/ocfs2/journal.c | 8 ++--
include/linux/jbd2.h | 2 +-
7 files changed, 124 insertions(+), 16 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index fe6045a46599..f44800361a38 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3223,7 +3223,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
journal = EXT4_JOURNAL(inode);
jbd2_journal_lock_updates(journal);
- err = jbd2_journal_flush(journal);
+ err = jbd2_journal_flush(journal, 0);
jbd2_journal_unlock_updates(journal);
if (err)
@@ -6005,7 +6005,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
if (val)
ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
else {
- err = jbd2_journal_flush(journal);
+ err = jbd2_journal_flush(journal, 0);
if (err < 0) {
jbd2_journal_unlock_updates(journal);
percpu_up_write(&sbi->s_writepages_rwsem);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 31627f7dc5cd..898705fc8d36 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -707,7 +707,7 @@ static long ext4_ioctl_group_add(struct file *file,
err = ext4_group_add(sb, input);
if (EXT4_SB(sb)->s_journal) {
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
- err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
}
if (err == 0)
@@ -885,7 +885,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
if (EXT4_SB(sb)->s_journal) {
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
- err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
}
if (err == 0)
@@ -1028,7 +1028,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (EXT4_SB(sb)->s_journal) {
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE);
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
- err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
}
if (err == 0)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7dc94f3e18e6..76c3bdb7e61d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5639,7 +5639,7 @@ static int ext4_mark_recovery_complete(struct super_block *sb,
return 0;
}
jbd2_journal_lock_updates(journal);
- err = jbd2_journal_flush(journal);
+ err = jbd2_journal_flush(journal, 0);
if (err < 0)
goto out;
@@ -5781,7 +5781,7 @@ static int ext4_freeze(struct super_block *sb)
* Don't clear the needs_recovery flag if we failed to
* flush the journal.
*/
- error = jbd2_journal_flush(journal);
+ error = jbd2_journal_flush(journal, 0);
if (error < 0)
goto out;
@@ -6376,7 +6376,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
* otherwise be livelocked...
*/
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
- err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
if (err)
return err;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 2dc944442802..f86929dbca3c 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1686,6 +1686,106 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
write_unlock(&journal->j_state_lock);
}
+#define JBD2_ERASE_FLAG_DISCARD 1
+#define JBD2_ERASE_FLAG_ZEROOUT 2
+
+/**
+ * __jbd2_journal_erase() - Discard or zeroout journal blocks (excluding superblock)
+ * @journal: The journal to erase.
+ * @flags: A discard/zeroout request is sent for each physically contigous region
+ * of the journal. Either JBD2_ERASE_FLAG_DISCARD or JBD2_ERASE_FLAG_ZEROOUT
+ * must be set to decide which operation to perform.
+ *
+ * Note: JBD2_ERASE_FLAG_ZEROOUT attempts to use hardware offload. Zeroes will
+ * be explicitly written if no hardware offload is available, see
+ * blkdev_issue_zeroout for more details.
+ */
+static int __jbd2_journal_erase(journal_t *journal, unsigned long long flags)
+{
+ int err = 0;
+ unsigned long block, log_offset; /* logical */
+ unsigned long long phys_block, block_start, block_stop; /* physical */
+ loff_t byte_start, byte_stop, byte_count;
+ struct request_queue *q = bdev_get_queue(journal->j_dev);
+
+ /* flags must be set to either discard or zeroout */
+ if ((flags & JBD2_ERASE_FLAG_DISCARD & JBD2_ERASE_FLAG_ZEROOUT) || !flags)
+ return -EINVAL;
+
+ if (!q)
+ return -ENXIO;
+
+ if (JBD2_ERASE_FLAG_DISCARD & !blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ /*
+ * lookup block mapping and issue discard/zeroout for each
+ * contiguous region
+ */
+ log_offset = be32_to_cpu(journal->j_superblock->s_first);
+
+ err = jbd2_journal_bmap(journal, log_offset, &block_start);
+ if (err) {
+ printk(KERN_ERR "JBD2: bad block at offset %lu", log_offset);
+ return err;
+ }
+
+ /*
+ * use block_start - 1 to meet check for contiguous with previous region:
+ * phys_block == block_stop + 1
+ */
+ block_stop = block_start - 1;
+
+ for (block = log_offset; block < journal->j_total_len; block++) {
+ err = jbd2_journal_bmap(journal, block, &phys_block);
+ if (err) {
+ printk(KERN_ERR "JBD2: bad block at offset %lu", block);
+ return err;
+ }
+
+ if (block == journal->j_total_len - 1) {
+ block_stop = phys_block;
+ } else if (phys_block == block_stop + 1) {
+ block_stop++;
+ continue;
+ }
+
+ /*
+ * not contiguous with prior physical block or this is last
+ * block of journal, take care of the region
+ */
+ byte_start = block_start * journal->j_blocksize;
+ byte_stop = block_stop * journal->j_blocksize;
+ byte_count = (block_stop - block_start + 1) *
+ journal->j_blocksize;
+
+ truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping,
+ byte_start, byte_stop);
+
+ if (flags & JBD2_ERASE_FLAG_DISCARD) {
+ err = blkdev_issue_discard(journal->j_dev,
+ byte_start >> SECTOR_SHIFT,
+ byte_count >> SECTOR_SHIFT,
+ GFP_NOFS, 0);
+ } else if (flags & JBD2_ERASE_FLAG_ZEROOUT) {
+ err = blkdev_issue_zeroout(journal->j_dev,
+ byte_start >> SECTOR_SHIFT,
+ byte_count >> SECTOR_SHIFT,
+ GFP_NOFS, 0);
+ }
+
+ if (unlikely(err != 0)) {
+ printk(KERN_ERR "JBD2: (error %d) unable to wipe journal at physical blocks %llu - %llu",
+ err, block_start, block_stop);
+ return err;
+ }
+
+ block_start = phys_block;
+ block_stop = phys_block;
+ }
+
+ return blkdev_issue_flush(journal->j_dev);
+}
/**
* jbd2_journal_update_sb_errno() - Update error in the journal.
@@ -2246,13 +2346,17 @@ EXPORT_SYMBOL(jbd2_journal_clear_features);
/**
* jbd2_journal_flush() - Flush journal
* @journal: Journal to act on.
+ * @flags: optional operations on the journal blocks after the flush (see below)
*
* Flush all data for a given journal to disk and empty the journal.
* Filesystems can use this when remounting readonly to ensure that
* recovery does not need to happen on remount.
+ *
+ * flags:
+ * EXT4_IOC_CHECKPOINT_FLAG_DISCARD: issues discards for the journal blocks
+ * EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT: issues zeroouts for the journal blocks
*/
-
-int jbd2_journal_flush(journal_t *journal)
+int jbd2_journal_flush(journal_t *journal, unsigned long long flags)
{
int err = 0;
transaction_t *transaction = NULL;
@@ -2306,6 +2410,10 @@ int jbd2_journal_flush(journal_t *journal)
* commits of data to the journal will restore the current
* s_start value. */
jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
+
+ if (flags)
+ err = __jbd2_journal_erase(journal, flags);
+
mutex_unlock(&journal->j_checkpoint_mutex);
write_lock(&journal->j_state_lock);
J_ASSERT(!journal->j_running_transaction);
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 78710788c237..5ff2c42cb46c 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6020,7 +6020,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
* Then truncate log will be replayed resulting in cluster double free.
*/
jbd2_journal_lock_updates(journal->j_journal);
- status = jbd2_journal_flush(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal, false);
jbd2_journal_unlock_updates(journal->j_journal);
if (status < 0) {
mlog_errno(status);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index db52e843002a..1c356b29c66d 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -310,7 +310,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
}
jbd2_journal_lock_updates(journal->j_journal);
- status = jbd2_journal_flush(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal, false);
jbd2_journal_unlock_updates(journal->j_journal);
if (status < 0) {
up_write(&journal->j_trans_barrier);
@@ -1002,7 +1002,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
if (ocfs2_mount_local(osb)) {
jbd2_journal_lock_updates(journal->j_journal);
- status = jbd2_journal_flush(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal, false);
jbd2_journal_unlock_updates(journal->j_journal);
if (status < 0)
mlog_errno(status);
@@ -1072,7 +1072,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
if (replayed) {
jbd2_journal_lock_updates(journal->j_journal);
- status = jbd2_journal_flush(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal, false);
jbd2_journal_unlock_updates(journal->j_journal);
if (status < 0)
mlog_errno(status);
@@ -1668,7 +1668,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
/* wipe the journal */
jbd2_journal_lock_updates(journal);
- status = jbd2_journal_flush(journal);
+ status = jbd2_journal_flush(journal, false);
jbd2_journal_unlock_updates(journal);
if (status < 0)
mlog_errno(status);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index db0e1920cb12..580d4e6a2f54 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1500,7 +1500,7 @@ extern int jbd2_journal_invalidatepage(journal_t *,
struct page *, unsigned int, unsigned int);
extern int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page);
extern int jbd2_journal_stop(handle_t *);
-extern int jbd2_journal_flush (journal_t *);
+extern int jbd2_journal_flush(journal_t *journal, unsigned long long flags);
extern void jbd2_journal_lock_updates (journal_t *);
extern void jbd2_journal_unlock_updates (journal_t *);
--
2.31.1.607.g51e8a6a459-goog
ioctl EXT4_IOC_CHECKPOINT checkpoints and flushes the journal. This
includes forcing all the transactions to the log, checkpointing the
transactions, and flushing the log to disk. This ioctl takes u64 "flags"
as an argument. Three flags are supported. EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN
returns success immediately upon entering the ioctl, without performing
any checkpointing. This can be used to check whether the ioctl exists on a
system. The other two flags, EXT4_IOC_CHECKPOINT_FLAG_DISCARD and
EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT, can be used to issue requests to
discard and zeroout the journal logs blocks, respectively. At this
point, EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT is primarily added to enable
testing of this codepath on devices that don't support discard.
EXT4_IOC_CHECKPOINT_FLAG_DISCARD and EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT
cannot both be set.
Systems that wish to achieve content deletion SLO can set up a daemon
that calls this ioctl at a regular interval such that it matches with the
SLO requirement. Thus, with this patch, the ext4_dir_entry2 wipeout
patch[1], and the Ext4 "-o discard" mount option set, Ext4 can now
guarantee that all file contents, file metatdata, and filenames will not
be accessible through the filesystem and will have had discard or
zeroout requests issued for corresponding device blocks.
The __jbd2_journal_erase function could also be used to discard or
zero-fill the journal during journal load after recovery. This would
provide a potential solution to a journal replay bug reported earlier this
year[2]. After a successful journal recovery, e2fsck can call this ioctl to
discard the journal as well.
[1] https://lore.kernel.org/linux-ext4/[email protected]/
[2] https://lore.kernel.org/linux-ext4/[email protected]/
Signed-off-by: Leah Rumancik <[email protected]>
Changes in v4:
- update commit description
- update error codes
- update code formatting
- add flag EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN
- add flag EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT
---
fs/ext4/ext4.h | 6 ++++++
fs/ext4/ioctl.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 57 insertions(+)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 37002663d521..41a5c6a83586 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -720,6 +720,7 @@ enum {
#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40)
#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32)
#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap)
+#define EXT4_IOC_CHECKPOINT _IOW('f', 43, __u64)
#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
@@ -741,6 +742,11 @@ enum {
#define EXT4_STATE_FLAG_NEWENTRY 0x00000004
#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008
+/* flags for ioctl EXT4_IOC_CHECKPOINT */
+#define EXT4_IOC_CHECKPOINT_FLAG_DISCARD 0x1
+#define EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT 0x2
+#define EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN 0x4
+
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
* ioctl commands in 32 bit emulation
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 898705fc8d36..fa79abd0415c 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -800,6 +800,53 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg)
return error;
}
+static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg)
+{
+ int err = 0;
+ unsigned long long flags = 0;
+ struct super_block *sb = file_inode(filp)->i_sb;
+
+ if (copy_from_user(&flags, (__u64 __user *)arg,
+ sizeof(__u64)))
+ return -EFAULT;
+
+ if (flags & EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN)
+ return 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* file argument is not the mount point */
+ if (file_dentry(filp) != sb->s_root)
+ return -EINVAL;
+
+ /* filesystem is not backed by block device */
+ if (sb->s_bdev == NULL)
+ return -ENODEV;
+
+ /* check for invalid bits set */
+ if (flags & ~(EXT4_IOC_CHECKPOINT_FLAG_DISCARD |
+ EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT))
+ return -EINVAL;
+
+ /* both discard and zeroout cannot be set */
+ if (flags & EXT4_IOC_CHECKPOINT_FLAG_DISCARD &
+ EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT)
+ return -EINVAL;
+
+ if (flags & EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT)
+ pr_info_ratelimited("warning: checkpointing journal with EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT can be slow");
+
+ if (!EXT4_SB(sb)->s_journal)
+ return -ENODEV;
+
+ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+ err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, flags);
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+
+ return err;
+}
+
static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -1211,6 +1258,9 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return fsverity_ioctl_read_metadata(filp,
(const void __user *)arg);
+ case EXT4_IOC_CHECKPOINT:
+ return ext4_ioctl_checkpoint(filp, arg);
+
default:
return -ENOTTY;
}
@@ -1291,6 +1341,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC_CLEAR_ES_CACHE:
case EXT4_IOC_GETSTATE:
case EXT4_IOC_GET_ES_CACHE:
+ case EXT4_IOC_CHECKPOINT:
break;
default:
return -ENOIOCTLCMD;
--
2.31.1.607.g51e8a6a459-goog
On Tue, May 11, 2021 at 06:04:26PM +0000, Leah Rumancik wrote:
> @@ -3223,7 +3223,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
> ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
> journal = EXT4_JOURNAL(inode);
> jbd2_journal_lock_updates(journal);
> - err = jbd2_journal_flush(journal);
> + err = jbd2_journal_flush(journal, 0);
In the ocfs2 changes, I noticed you are using "false", instead of 0,
in the second argument to jbd2_journal_flush.
When I looked more closely, the function signature of
jbd2_journal_flush is also using an unsigned long long for flags,
which struck me as strange:
> +extern int jbd2_journal_flush(journal_t *journal, unsigned long long flags);
I then noticed that later in the patch series, the ioctl argument is
taking an unsigned long long and we're passing that straight through
to jbd2_journal_flush().
First of all, unsigned long long is not very efficient on many
platforms (especially 32-bit platforms), but also on platforms where
int is 32 bits. If we don't expect us to need more than 32 flag bits,
I'd suggest explicit ly using __u32 in ioctl interface. (__u32 is
fine; it's the use of the base int type which can get us into trouble,
since int can be either 32 or 64 bits depending on the architecture).
Secondly, I'd suggest using a different set of flags for
jbd2_journal_flush(), which is an internal kernel interface, and the
EXT4_IOC_CHECKPOINT interface. We might in the future want to add
some internal flags to jbd2_journal_flush that we do *not* want to
expose via EXT4_IOC_CHECKPOINT, and so it's best that we keep those
two interfaces separate.
> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> index 2dc944442802..f86929dbca3c 100644
> --- a/fs/jbd2/journal.c
> +++ b/fs/jbd2/journal.c
> @@ -1686,6 +1686,106 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
> write_unlock(&journal->j_state_lock);
> }
>
> +#define JBD2_ERASE_FLAG_DISCARD 1
> +#define JBD2_ERASE_FLAG_ZEROOUT 2
I'd suggest defining these in include/linux/jbd2.h, and giving them
names like: JBD2_JOURNAL_FLUSH_DISCARD and JBD2_JOURNAL_FLUSH_ERASE...
(and making the flags parameter an unsigned int).
> + /* flags must be set to either discard or zeroout */
> + if ((flags & JBD2_ERASE_FLAG_DISCARD & JBD2_ERASE_FLAG_ZEROOUT) || !flags)
> + return -EINVAL;
The expression (flags & JBD2_ERASE_FLAG_DISCARD & JBD2_ERASE_FLAG_ZEROOUT)
is always going to evaluate to zero, since (1 & 2) is 0.
What you probably want is something like:
#define JBD2_JOURNAL_FLUSH_DISCARD 0x0001
#define JBD2_JOURNAL_FLUSH_ZEROOUT 0x0002
#define JBD2_JOURNAL_FLUSH_VALID 0x0003
if ((flags & ~JBD2_JOURNAL_FLUSH_VALID) ||
((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
(flags & JBD2_JOURNAL_FLUSH_ZEROOUT)))
return -EINVAL;
> +
> + err = jbd2_journal_bmap(journal, log_offset, &block_start);
> + if (err) {
> + printk(KERN_ERR "JBD2: bad block at offset %lu", log_offset);
> + return err;
> + }
We could get rid of this, and instead make sure block_start is initialized
to ~((unsigned long long) 0). Then in the loop we can do...
> +
> + /*
> + * use block_start - 1 to meet check for contiguous with previous region:
> + * phys_block == block_stop + 1
> + */
> + block_stop = block_start - 1;
> +
> + for (block = log_offset; block < journal->j_total_len; block++) {
> + err = jbd2_journal_bmap(journal, block, &phys_block);
> + if (err) {
> + printk(KERN_ERR "JBD2: bad block at offset %lu", block);
> + return err;
> + }
if (block_start == ~((unsigned long long) 0)) {
block_start = phys_block;
block_Stop = block_start - 1;
}
> +
> + if (block == journal->j_total_len - 1) {
> + block_stop = phys_block;
> + } else if (phys_block == block_stop + 1) {
> + block_stop++;
> + continue;
> + }
> +
> + /*
> + * not contiguous with prior physical block or this is last
> + * block of journal, take care of the region
> + */
> + byte_start = block_start * journal->j_blocksize;
> + byte_stop = block_stop * journal->j_blocksize;
> + byte_count = (block_stop - block_start + 1) *
> + journal->j_blocksize;
> +
> + truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping,
> + byte_start, byte_stop);
> +
> + if (flags & JBD2_ERASE_FLAG_DISCARD) {
> + err = blkdev_issue_discard(journal->j_dev,
> + byte_start >> SECTOR_SHIFT,
> + byte_count >> SECTOR_SHIFT,
> + GFP_NOFS, 0);
> + } else if (flags & JBD2_ERASE_FLAG_ZEROOUT) {
> + err = blkdev_issue_zeroout(journal->j_dev,
> + byte_start >> SECTOR_SHIFT,
> + byte_count >> SECTOR_SHIFT,
> + GFP_NOFS, 0);
> + }
> +
> + if (unlikely(err != 0)) {
> + printk(KERN_ERR "JBD2: (error %d) unable to wipe journal at physical blocks %llu - %llu",
> + err, block_start, block_stop);
> + return err;
> + }
> +
> + block_start = phys_block;
> + block_stop = phys_block;
Is this right? When we initialized the loop, above, block_stop was
set to block_start-1 (where block_start == phys_block). So I think it
might be more correct to replace the above two lines with:
block_start = ~((unsigned long long) 0);
... and then let block_start and block_stop be initialized in a single
place. Do you agree? Does this make sense to you?
- Ted
On Thu, May 13, 2021 at 02:09:26PM -0400, Theodore Ts'o wrote:
> On Tue, May 11, 2021 at 06:04:26PM +0000, Leah Rumancik wrote:
> > @@ -3223,7 +3223,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
> > ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
> > journal = EXT4_JOURNAL(inode);
> > jbd2_journal_lock_updates(journal);
> > - err = jbd2_journal_flush(journal);
> > + err = jbd2_journal_flush(journal, 0);
>
> In the ocfs2 changes, I noticed you are using "false", instead of 0,
> in the second argument to jbd2_journal_flush.
>
> When I looked more closely, the function signature of
> jbd2_journal_flush is also using an unsigned long long for flags,
> which struck me as strange:
>
> > +extern int jbd2_journal_flush(journal_t *journal, unsigned long long flags);
>
> I then noticed that later in the patch series, the ioctl argument is
> taking an unsigned long long and we're passing that straight through
> to jbd2_journal_flush().
>
> First of all, unsigned long long is not very efficient on many
> platforms (especially 32-bit platforms), but also on platforms where
> int is 32 bits. If we don't expect us to need more than 32 flag bits,
> I'd suggest explicit ly using __u32 in ioctl interface. (__u32 is
> fine; it's the use of the base int type which can get us into trouble,
> since int can be either 32 or 64 bits depending on the architecture).
FWIW I had been advocating for u64 for the ioctl interface since that's
the hardest part to change; once we've gotten that into the kernel and
remapped the ioctl flags to jbd2 flags, you can do whatever you want.
> Secondly, I'd suggest using a different set of flags for
> jbd2_journal_flush(), which is an internal kernel interface, and the
> EXT4_IOC_CHECKPOINT interface. We might in the future want to add
> some internal flags to jbd2_journal_flush that we do *not* want to
> expose via EXT4_IOC_CHECKPOINT, and so it's best that we keep those
> two interfaces separate.
>
> > diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> > index 2dc944442802..f86929dbca3c 100644
> > --- a/fs/jbd2/journal.c
> > +++ b/fs/jbd2/journal.c
> > @@ -1686,6 +1686,106 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
> > write_unlock(&journal->j_state_lock);
> > }
> >
> > +#define JBD2_ERASE_FLAG_DISCARD 1
> > +#define JBD2_ERASE_FLAG_ZEROOUT 2
>
> I'd suggest defining these in include/linux/jbd2.h, and giving them
> names like: JBD2_JOURNAL_FLUSH_DISCARD and JBD2_JOURNAL_FLUSH_ERASE...
> (and making the flags parameter an unsigned int).
>
> > + /* flags must be set to either discard or zeroout */
> > + if ((flags & JBD2_ERASE_FLAG_DISCARD & JBD2_ERASE_FLAG_ZEROOUT) || !flags)
> > + return -EINVAL;
>
> The expression (flags & JBD2_ERASE_FLAG_DISCARD & JBD2_ERASE_FLAG_ZEROOUT)
> is always going to evaluate to zero, since (1 & 2) is 0.
>
> What you probably want is something like:
>
> #define JBD2_JOURNAL_FLUSH_DISCARD 0x0001
> #define JBD2_JOURNAL_FLUSH_ZEROOUT 0x0002
> #define JBD2_JOURNAL_FLUSH_VALID 0x0003
>
> if ((flags & ~JBD2_JOURNAL_FLUSH_VALID) ||
> ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
> (flags & JBD2_JOURNAL_FLUSH_ZEROOUT)))
> return -EINVAL;
>
> > +
> > + err = jbd2_journal_bmap(journal, log_offset, &block_start);
> > + if (err) {
> > + printk(KERN_ERR "JBD2: bad block at offset %lu", log_offset);
> > + return err;
> > + }
>
> We could get rid of this, and instead make sure block_start is initialized
> to ~((unsigned long long) 0). Then in the loop we can do...
Also FWIW I can't find the fiemap code that let you do fiemap from
within the kernel, so I guess we only talked about it on fsdevel and
none of it ever got merged. So I guess looping is what we'll have to do
for now...
--D
> > +
> > + /*
> > + * use block_start - 1 to meet check for contiguous with previous region:
> > + * phys_block == block_stop + 1
> > + */
> > + block_stop = block_start - 1;
> > +
> > + for (block = log_offset; block < journal->j_total_len; block++) {
> > + err = jbd2_journal_bmap(journal, block, &phys_block);
> > + if (err) {
> > + printk(KERN_ERR "JBD2: bad block at offset %lu", block);
> > + return err;
> > + }
>
> if (block_start == ~((unsigned long long) 0)) {
> block_start = phys_block;
> block_Stop = block_start - 1;
> }
>
> > +
> > + if (block == journal->j_total_len - 1) {
> > + block_stop = phys_block;
> > + } else if (phys_block == block_stop + 1) {
> > + block_stop++;
> > + continue;
> > + }
> > +
> > + /*
> > + * not contiguous with prior physical block or this is last
> > + * block of journal, take care of the region
> > + */
> > + byte_start = block_start * journal->j_blocksize;
> > + byte_stop = block_stop * journal->j_blocksize;
> > + byte_count = (block_stop - block_start + 1) *
> > + journal->j_blocksize;
> > +
> > + truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping,
> > + byte_start, byte_stop);
> > +
> > + if (flags & JBD2_ERASE_FLAG_DISCARD) {
> > + err = blkdev_issue_discard(journal->j_dev,
> > + byte_start >> SECTOR_SHIFT,
> > + byte_count >> SECTOR_SHIFT,
> > + GFP_NOFS, 0);
> > + } else if (flags & JBD2_ERASE_FLAG_ZEROOUT) {
> > + err = blkdev_issue_zeroout(journal->j_dev,
> > + byte_start >> SECTOR_SHIFT,
> > + byte_count >> SECTOR_SHIFT,
> > + GFP_NOFS, 0);
> > + }
> > +
> > + if (unlikely(err != 0)) {
> > + printk(KERN_ERR "JBD2: (error %d) unable to wipe journal at physical blocks %llu - %llu",
> > + err, block_start, block_stop);
> > + return err;
> > + }
> > +
> > + block_start = phys_block;
> > + block_stop = phys_block;
>
> Is this right? When we initialized the loop, above, block_stop was
> set to block_start-1 (where block_start == phys_block). So I think it
> might be more correct to replace the above two lines with:
>
> block_start = ~((unsigned long long) 0);
>
> ... and then let block_start and block_stop be initialized in a single
> place. Do you agree? Does this make sense to you?
>
> - Ted
On Thu, May 13, 2021 at 02:09:26PM -0400, Theodore Ts'o wrote:
> On Tue, May 11, 2021 at 06:04:26PM +0000, Leah Rumancik wrote:
> > @@ -3223,7 +3223,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
> > ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
> > journal = EXT4_JOURNAL(inode);
> > jbd2_journal_lock_updates(journal);
> > - err = jbd2_journal_flush(journal);
> > + err = jbd2_journal_flush(journal, 0);
>
> In the ocfs2 changes, I noticed you are using "false", instead of 0,
> in the second argument to jbd2_journal_flush.
>
> When I looked more closely, the function signature of
> jbd2_journal_flush is also using an unsigned long long for flags,
> which struck me as strange:
>
> > +extern int jbd2_journal_flush(journal_t *journal, unsigned long long flags);
>
> I then noticed that later in the patch series, the ioctl argument is
> taking an unsigned long long and we're passing that straight through
> to jbd2_journal_flush().
>
> First of all, unsigned long long is not very efficient on many
> platforms (especially 32-bit platforms), but also on platforms where
> int is 32 bits. If we don't expect us to need more than 32 flag bits,
> I'd suggest explicit ly using __u32 in ioctl interface. (__u32 is
> fine; it's the use of the base int type which can get us into trouble,
> since int can be either 32 or 64 bits depending on the architecture).
>
Just to make sure I understand correctly, the explicit __u32 is critical
due to the size being read in by the ioctl, specifically through
copy_from_user? When do you switch from __u32 to unsigned long? I don't
see the __* types being carried throughout.
(Also, just got Darrick's reply about the 32 vs. 64. Yes, originally
went with 64 because there was an argument for it. I believe the 32
is likely sufficient, but I don't feel that strongly about this matter)
> Secondly, I'd suggest using a different set of flags for
> jbd2_journal_flush(), which is an internal kernel interface, and the
> EXT4_IOC_CHECKPOINT interface. We might in the future want to add
> some internal flags to jbd2_journal_flush that we do *not* want to
> expose via EXT4_IOC_CHECKPOINT, and so it's best that we keep those
> two interfaces separate.
>
> > diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> > index 2dc944442802..f86929dbca3c 100644
> > --- a/fs/jbd2/journal.c
> > +++ b/fs/jbd2/journal.c
> > @@ -1686,6 +1686,106 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
> > write_unlock(&journal->j_state_lock);
> > }
> >
> > +#define JBD2_ERASE_FLAG_DISCARD 1
> > +#define JBD2_ERASE_FLAG_ZEROOUT 2
>
> I'd suggest defining these in include/linux/jbd2.h, and giving them
> names like: JBD2_JOURNAL_FLUSH_DISCARD and JBD2_JOURNAL_FLUSH_ERASE...
> (and making the flags parameter an unsigned int).
>
> > + /* flags must be set to either discard or zeroout */
> > + if ((flags & JBD2_ERASE_FLAG_DISCARD & JBD2_ERASE_FLAG_ZEROOUT) || !flags)
> > + return -EINVAL;
>
> The expression (flags & JBD2_ERASE_FLAG_DISCARD & JBD2_ERASE_FLAG_ZEROOUT)
> is always going to evaluate to zero, since (1 & 2) is 0.
>
> What you probably want is something like:
>
> #define JBD2_JOURNAL_FLUSH_DISCARD 0x0001
> #define JBD2_JOURNAL_FLUSH_ZEROOUT 0x0002
> #define JBD2_JOURNAL_FLUSH_VALID 0x0003
>
Why call them JBD2_JOURNAL_FLUSH* instead of JBD2_JOURNAL_ERASE* since
they get passed directly through to the erase function? I feel like it
would be weird if someone wanted to use the erase function directly but
had to use JBD2_JOURNAL_FLUSH* flags.
> if ((flags & ~JBD2_JOURNAL_FLUSH_VALID) ||
> ((flags & JBD2_JOURNAL_FLUSH_DISCARD) &&
> (flags & JBD2_JOURNAL_FLUSH_ZEROOUT)))
> return -EINVAL;
>
Ah, great. Thanks!
> > +
> > + err = jbd2_journal_bmap(journal, log_offset, &block_start);
> > + if (err) {
> > + printk(KERN_ERR "JBD2: bad block at offset %lu", log_offset);
> > + return err;
> > + }
>
> We could get rid of this, and instead make sure block_start is initialized
> to ~((unsigned long long) 0). Then in the loop we can do...
>
> > +
> > + /*
> > + * use block_start - 1 to meet check for contiguous with previous region:
> > + * phys_block == block_stop + 1
> > + */
> > + block_stop = block_start - 1;
> > +
> > + for (block = log_offset; block < journal->j_total_len; block++) {
> > + err = jbd2_journal_bmap(journal, block, &phys_block);
> > + if (err) {
> > + printk(KERN_ERR "JBD2: bad block at offset %lu", block);
> > + return err;
> > + }
>
> if (block_start == ~((unsigned long long) 0)) {
> block_start = phys_block;
> block_Stop = block_start - 1;
> }
>
> > +
> > + if (block == journal->j_total_len - 1) {
> > + block_stop = phys_block;
> > + } else if (phys_block == block_stop + 1) {
> > + block_stop++;
> > + continue;
> > + }
> > +
> > + /*
> > + * not contiguous with prior physical block or this is last
> > + * block of journal, take care of the region
> > + */
> > + byte_start = block_start * journal->j_blocksize;
> > + byte_stop = block_stop * journal->j_blocksize;
> > + byte_count = (block_stop - block_start + 1) *
> > + journal->j_blocksize;
> > +
> > + truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping,
> > + byte_start, byte_stop);
> > +
> > + if (flags & JBD2_ERASE_FLAG_DISCARD) {
> > + err = blkdev_issue_discard(journal->j_dev,
> > + byte_start >> SECTOR_SHIFT,
> > + byte_count >> SECTOR_SHIFT,
> > + GFP_NOFS, 0);
> > + } else if (flags & JBD2_ERASE_FLAG_ZEROOUT) {
> > + err = blkdev_issue_zeroout(journal->j_dev,
> > + byte_start >> SECTOR_SHIFT,
> > + byte_count >> SECTOR_SHIFT,
> > + GFP_NOFS, 0);
> > + }
> > +
> > + if (unlikely(err != 0)) {
> > + printk(KERN_ERR "JBD2: (error %d) unable to wipe journal at physical blocks %llu - %llu",
> > + err, block_start, block_stop);
> > + return err;
> > + }
> > +
> > + block_start = phys_block;
> > + block_stop = phys_block;
>
> Is this right? When we initialized the loop, above, block_stop was
> set to block_start-1 (where block_start == phys_block). So I think it
> might be more correct to replace the above two lines with:
>
> block_start = ~((unsigned long long) 0);
>
I'll play around with this and see if I can get it to work. Seems like
it might simplify the code a bit.
> ... and then let block_start and block_stop be initialized in a single
> place. Do you agree? Does this make sense to you?
>
> - Ted
Thanks!
Leah
On Tue, May 11, 2021 at 06:04:27PM +0000, Leah Rumancik wrote:
> +static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg)
> +{
> + int err = 0;
> + unsigned long long flags = 0;
> + struct super_block *sb = file_inode(filp)->i_sb;
> +
> + if (copy_from_user(&flags, (__u64 __user *)arg,
> + sizeof(__u64)))
> + return -EFAULT;
> +
> + if (flags & EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN)
> + return 0;
We should document exactly what "Dry run" means. Right now, it looks
like it's used so we can tell whether the ioctl is support at all. It
might be better to do all of the checks first, so that if
EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN is set, and the ioctl returns
success, we know that all of the ioctl would succeed. This would
allow us to use DRY_RUN to check to see if a future flag bit is supported.
> +
> + if (!capable(CAP_SYS_ADMIN))
> + return -EPERM;
> +
> + /* file argument is not the mount point */
> + if (file_dentry(filp) != sb->s_root)
> + return -EINVAL;
I'm not sure we need to require that EXT4_IOC_CHECKPOINT has to be
called with the mount point, especially given that only process with
root privs will be allowed to call the ioctl. SoI'd suggest removing
the check above, and allowing a file descriptor opened on any file or
directory on the file system to be sufficient to trigger the ioctl.
> + /* filesystem is not backed by block device */
> + if (sb->s_bdev == NULL)
> + return -ENODEV;
This should never be the case for ext4....
> +
> + /* check for invalid bits set */
> + if (flags & ~(EXT4_IOC_CHECKPOINT_FLAG_DISCARD |
> + EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT))
> + return -EINVAL;
> +
> + /* both discard and zeroout cannot be set */
> + if (flags & EXT4_IOC_CHECKPOINT_FLAG_DISCARD &
> + EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT)
> + return -EINVAL;
This check isn't correct; see a similar comment that I made on patch
#1 of this series.
> +
> + if (flags & EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT)
> + pr_info_ratelimited("warning: checkpointing journal with EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT can be slow");
> +
> + if (!EXT4_SB(sb)->s_journal)
> + return -ENODEV;
So this is where I would actually move the DRY_RUN flag check (and
then I'd move the pr_info_ratelimited check after the DRY_RUN check).
Cheers,
- Ted
On Thu, May 13, 2021 at 02:09:26PM -0400, Theodore Ts'o wrote:
> > +
> > + err = jbd2_journal_bmap(journal, log_offset, &block_start);
> > + if (err) {
> > + printk(KERN_ERR "JBD2: bad block at offset %lu", log_offset);
> > + return err;
> > + }
>
> We could get rid of this, and instead make sure block_start is initialized
> to ~((unsigned long long) 0). Then in the loop we can do...
>
> > +
> > + /*
> > + * use block_start - 1 to meet check for contiguous with previous region:
> > + * phys_block == block_stop + 1
> > + */
> > + block_stop = block_start - 1;
> > +
> > + for (block = log_offset; block < journal->j_total_len; block++) {
> > + err = jbd2_journal_bmap(journal, block, &phys_block);
> > + if (err) {
> > + printk(KERN_ERR "JBD2: bad block at offset %lu", block);
> > + return err;
> > + }
>
> if (block_start == ~((unsigned long long) 0)) {
> block_start = phys_block;
> block_Stop = block_start - 1;
> }
>
> > +
> > + if (block == journal->j_total_len - 1) {
> > + block_stop = phys_block;
> > + } else if (phys_block == block_stop + 1) {
> > + block_stop++;
> > + continue;
> > + }
> > +
> > + /*
> > + * not contiguous with prior physical block or this is last
> > + * block of journal, take care of the region
> > + */
> > + byte_start = block_start * journal->j_blocksize;
> > + byte_stop = block_stop * journal->j_blocksize;
> > + byte_count = (block_stop - block_start + 1) *
> > + journal->j_blocksize;
> > +
> > + truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping,
> > + byte_start, byte_stop);
> > +
> > + if (flags & JBD2_ERASE_FLAG_DISCARD) {
> > + err = blkdev_issue_discard(journal->j_dev,
> > + byte_start >> SECTOR_SHIFT,
> > + byte_count >> SECTOR_SHIFT,
> > + GFP_NOFS, 0);
> > + } else if (flags & JBD2_ERASE_FLAG_ZEROOUT) {
> > + err = blkdev_issue_zeroout(journal->j_dev,
> > + byte_start >> SECTOR_SHIFT,
> > + byte_count >> SECTOR_SHIFT,
> > + GFP_NOFS, 0);
> > + }
> > +
> > + if (unlikely(err != 0)) {
> > + printk(KERN_ERR "JBD2: (error %d) unable to wipe journal at physical blocks %llu - %llu",
> > + err, block_start, block_stop);
> > + return err;
> > + }
> > +
> > + block_start = phys_block;
> > + block_stop = phys_block;
>
> Is this right? When we initialized the loop, above, block_stop was
> set to block_start-1 (where block_start == phys_block). So I think it
> might be more correct to replace the above two lines with:
>
> block_start = ~((unsigned long long) 0);
>
> ... and then let block_start and block_stop be initialized in a single
> place. Do you agree? Does this make sense to you?
I just tried this and it actually wouldn't work with this setup because
phys_block would be set after the new call to bmap instead of keeping the value
from the end of the prior loop. However, I have reworked the code using the
general idea of the block_start reset you proposed and I will submit this in
the next version.
Thanks,
Leah
>
> - Ted
On Thu, May 13, 2021 at 04:27:23PM -0400, Leah Rumancik wrote:
>
> Just to make sure I understand correctly, the explicit __u32 is critical
> due to the size being read in by the ioctl, specifically through
> copy_from_user? When do you switch from __u32 to unsigned long? I don't
> see the __* types being carried throughout.
What I was thinking was something like this:
static int ext4_foo_ioctl(struct super_block *sb, unsigned long arg)
{
__u32 flags;
unsigned int f = 0;
if (get_user(flags, (__u32 __user *) arg))
return -EFAULT;
if (flags & EXT4_IOC_FOO_AAA)
f |= JBD2_FLAGS_FOO_AAA;
if (flags & EXT4_IOC_FOO_BBB)
f |= JBD2_FLAGS_FOO_BBB;
...
jbd_foo(sb, f);
}
So there are two separate flag namespaces, one exported to userspace
which is __u32, and which are the EXT4_IOC_FOO_*, defined in
fs/ext4/ext4.h. (Actually, we should move the ext4 ioctls to a header
file like under include/uapi/..., maybe include/uapi/linux/ext4.h, or
include/uapi/fs/ext4.h, but that's a different patch series.)
And then there is a second flag namespace, JBD2_FLAGS_FOO_*, which is
defined in include/linux/jbd2.h, which is an unsigned int, and which
is a kernel-internal interface.
> (Also, just got Darrick's reply about the 32 vs. 64. Yes, originally
> went with 64 because there was an argument for it. I believe the 32
> is likely sufficient, but I don't feel that strongly about this matter)
Sure, but for EXT4_IOC_SHUTDOWN? We have 3 flags defined today, and
I'm not convinced we'll have 12 more flags defined, let alone enough
that we really need a 64-bit flags.
> > What you probably want is something like:
> >
> > #define JBD2_JOURNAL_FLUSH_DISCARD 0x0001
> > #define JBD2_JOURNAL_FLUSH_ZEROOUT 0x0002
> > #define JBD2_JOURNAL_FLUSH_VALID 0x0003
> >
>
> Why call them JBD2_JOURNAL_FLUSH* instead of JBD2_JOURNAL_ERASE* since
> they get passed directly through to the erase function? I feel like it
> would be weird if someone wanted to use the erase function directly but
> had to use JBD2_JOURNAL_FLUSH* flags.
The erase function is a static function that's not exported outside of
fs/jbd2. The interface which is exposed to kernel callers outside of
fs/jbd2 is jbd2_journal_flush(). Since that's the public interface,
the flags should be similarly defined that way.
Cheers,
- Ted