From: Eric Biggers <[email protected]
This series fixes several bugs in the fast-commit feature.
Patch 6 may be the most controversial patch of this series, since it
would make old kernels unable to replay fast-commit journals created by
new kernels. I'd appreciate any thoughts on whether that's okay. I can
drop that patch if needed.
I've tested that this series doesn't introduce any regressions with
'gce-xfstests -c ext4/fast_commit -g auto'. Note that ext4/039,
ext4/053, and generic/475 fail both before and after.
Eric Biggers (7):
ext4: disable fast-commit of encrypted dir operations
ext4: don't set up encryption key during jbd2 transaction
ext4: fix leaking uninitialized memory in fast-commit journal
ext4: add missing validation of fast-commit record lengths
ext4: fix unaligned memory access in ext4_fc_reserve_space()
ext4: fix off-by-one errors in fast-commit block filling
ext4: simplify fast-commit CRC calculation
fs/ext4/ext4.h | 4 +-
fs/ext4/fast_commit.c | 203 ++++++++++++++++++------------------
fs/ext4/fast_commit.h | 3 +-
fs/ext4/namei.c | 44 ++++----
include/trace/events/ext4.h | 7 +-
5 files changed, 132 insertions(+), 129 deletions(-)
base-commit: 089d1c31224e6b266ece3ee555a3ea2c9acbe5c2
--
2.38.1
From: Eric Biggers <[email protected]>
Commit a80f7fcf1867 ("ext4: fixup ext4_fc_track_* functions' signature")
extended the scope of the transaction in ext4_unlink() too far, making
it include the call to ext4_find_entry(). However, ext4_find_entry()
can deadlock when called from within a transaction because it may need
to set up the directory's encryption key.
Fix this by restoring the transaction to its original scope.
Reported-by: [email protected]
Fixes: a80f7fcf1867 ("ext4: fixup ext4_fc_track_* functions' signature")
Cc: <[email protected]> # v5.10+
Signed-off-by: Eric Biggers <[email protected]>
---
fs/ext4/ext4.h | 4 ++--
fs/ext4/fast_commit.c | 2 +-
fs/ext4/namei.c | 44 +++++++++++++++++++++++--------------------
3 files changed, 27 insertions(+), 23 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8d5453852f98e..acaa951ef1886 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3619,8 +3619,8 @@ extern void ext4_initialize_dirent_tail(struct buffer_head *bh,
unsigned int blocksize);
extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
-extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name,
- struct inode *inode);
+extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
+ struct inode *inode, struct dentry *dentry);
extern int __ext4_link(struct inode *dir, struct inode *inode,
struct dentry *dentry);
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 6d98f2b39b775..da0c8228cf9c3 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -1397,7 +1397,7 @@ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
return 0;
}
- ret = __ext4_unlink(NULL, old_parent, &entry, inode);
+ ret = __ext4_unlink(old_parent, &entry, inode, NULL);
/* -ENOENT ok coz it might not exist anymore. */
if (ret == -ENOENT)
ret = 0;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index c08c0aba18836..a789ea9b61a09 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3204,14 +3204,20 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
return retval;
}
-int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name,
- struct inode *inode)
+int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
+ struct inode *inode,
+ struct dentry *dentry /* NULL during fast_commit recovery */)
{
int retval = -ENOENT;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
+ handle_t *handle;
int skip_remove_dentry = 0;
+ /*
+ * Keep this outside the transaction; it may have to set up the
+ * directory's encryption key, which isn't GFP_NOFS-safe.
+ */
bh = ext4_find_entry(dir, d_name, &de, NULL);
if (IS_ERR(bh))
return PTR_ERR(bh);
@@ -3228,7 +3234,14 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
skip_remove_dentry = 1;
else
- goto out;
+ goto out_bh;
+ }
+
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle)) {
+ retval = PTR_ERR(handle);
+ goto out_bh;
}
if (IS_DIRSYNC(dir))
@@ -3237,12 +3250,12 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name
if (!skip_remove_dentry) {
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
- goto out;
+ goto out_handle;
dir->i_ctime = dir->i_mtime = current_time(dir);
ext4_update_dx_flag(dir);
retval = ext4_mark_inode_dirty(handle, dir);
if (retval)
- goto out;
+ goto out_handle;
} else {
retval = 0;
}
@@ -3255,15 +3268,17 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name
ext4_orphan_add(handle, inode);
inode->i_ctime = current_time(inode);
retval = ext4_mark_inode_dirty(handle, inode);
-
-out:
+ if (dentry && !retval)
+ ext4_fc_track_unlink(handle, dentry);
+out_handle:
+ ext4_journal_stop(handle);
+out_bh:
brelse(bh);
return retval;
}
static int ext4_unlink(struct inode *dir, struct dentry *dentry)
{
- handle_t *handle;
int retval;
if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
@@ -3281,16 +3296,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (retval)
goto out_trace;
- handle = ext4_journal_start(dir, EXT4_HT_DIR,
- EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
- if (IS_ERR(handle)) {
- retval = PTR_ERR(handle);
- goto out_trace;
- }
-
- retval = __ext4_unlink(handle, dir, &dentry->d_name, d_inode(dentry));
- if (!retval)
- ext4_fc_track_unlink(handle, dentry);
+ retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry), dentry);
#if IS_ENABLED(CONFIG_UNICODE)
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
@@ -3301,8 +3307,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (IS_CASEFOLDED(dir))
d_invalidate(dentry);
#endif
- if (handle)
- ext4_journal_stop(handle);
out_trace:
trace_ext4_unlink_exit(dentry, retval);
--
2.38.1
From: Eric Biggers <[email protected]>
fast-commit of create, link, and unlink operations in encrypted
directories is completely broken because the unencrypted filenames are
being written to the fast-commit journal instead of the encrypted
filenames. These operations can't be replayed, as encryption keys
aren't present at journal replay time. It is also an information leak.
Until if/when we can get this working properly, make encrypted directory
operations ineligible for fast-commit.
Note that fast-commit operations on encrypted regular files continue to
be allowed, as they seem to work.
Fixes: aa75f4d3daae ("ext4: main fast-commit commit path")
Cc: <[email protected]> # v5.10+
Signed-off-by: Eric Biggers <[email protected]>
---
fs/ext4/fast_commit.c | 41 ++++++++++++++++++++++---------------
fs/ext4/fast_commit.h | 1 +
include/trace/events/ext4.h | 7 +++++--
3 files changed, 31 insertions(+), 18 deletions(-)
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 0f6d0a80467d7..6d98f2b39b775 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -420,25 +420,34 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
struct __track_dentry_update_args *dentry_update =
(struct __track_dentry_update_args *)arg;
struct dentry *dentry = dentry_update->dentry;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct inode *dir = dentry->d_parent->d_inode;
+ struct super_block *sb = inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
mutex_unlock(&ei->i_fc_lock);
+
+ if (IS_ENCRYPTED(dir)) {
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME,
+ NULL);
+ mutex_lock(&ei->i_fc_lock);
+ return -EOPNOTSUPP;
+ }
+
node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
if (!node) {
- ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL);
mutex_lock(&ei->i_fc_lock);
return -ENOMEM;
}
node->fcd_op = dentry_update->op;
- node->fcd_parent = dentry->d_parent->d_inode->i_ino;
+ node->fcd_parent = dir->i_ino;
node->fcd_ino = inode->i_ino;
if (dentry->d_name.len > DNAME_INLINE_LEN) {
node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
if (!node->fcd_name.name) {
kmem_cache_free(ext4_fc_dentry_cachep, node);
- ext4_fc_mark_ineligible(inode->i_sb,
- EXT4_FC_REASON_NOMEM, NULL);
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL);
mutex_lock(&ei->i_fc_lock);
return -ENOMEM;
}
@@ -2249,17 +2258,17 @@ void ext4_fc_init(struct super_block *sb, journal_t *journal)
journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}
-static const char *fc_ineligible_reasons[] = {
- "Extended attributes changed",
- "Cross rename",
- "Journal flag changed",
- "Insufficient memory",
- "Swap boot",
- "Resize",
- "Dir renamed",
- "Falloc range op",
- "Data journalling",
- "FC Commit Failed"
+static const char * const fc_ineligible_reasons[] = {
+ [EXT4_FC_REASON_XATTR] = "Extended attributes changed",
+ [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename",
+ [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed",
+ [EXT4_FC_REASON_NOMEM] = "Insufficient memory",
+ [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot",
+ [EXT4_FC_REASON_RESIZE] = "Resize",
+ [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed",
+ [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
+ [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
+ [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
};
int ext4_fc_info_show(struct seq_file *seq, void *v)
diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h
index a6154c3ed1357..256f2ad27204a 100644
--- a/fs/ext4/fast_commit.h
+++ b/fs/ext4/fast_commit.h
@@ -96,6 +96,7 @@ enum {
EXT4_FC_REASON_RENAME_DIR,
EXT4_FC_REASON_FALLOC_RANGE,
EXT4_FC_REASON_INODE_JOURNAL_DATA,
+ EXT4_FC_REASON_ENCRYPTED_FILENAME,
EXT4_FC_REASON_MAX
};
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 229e8fae66a34..ced95fec3367d 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -104,6 +104,7 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_RESIZE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_RENAME_DIR);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_FALLOC_RANGE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_INODE_JOURNAL_DATA);
+TRACE_DEFINE_ENUM(EXT4_FC_REASON_ENCRYPTED_FILENAME);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX);
#define show_fc_reason(reason) \
@@ -116,7 +117,8 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX);
{ EXT4_FC_REASON_RESIZE, "RESIZE"}, \
{ EXT4_FC_REASON_RENAME_DIR, "RENAME_DIR"}, \
{ EXT4_FC_REASON_FALLOC_RANGE, "FALLOC_RANGE"}, \
- { EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"})
+ { EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"}, \
+ { EXT4_FC_REASON_ENCRYPTED_FILENAME, "ENCRYPTED_FILENAME"})
TRACE_EVENT(ext4_other_inode_update_time,
TP_PROTO(struct inode *inode, ino_t orig_ino),
@@ -2764,7 +2766,7 @@ TRACE_EVENT(ext4_fc_stats,
),
TP_printk("dev %d,%d fc ineligible reasons:\n"
- "%s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u "
+ "%s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u"
"num_commits:%lu, ineligible: %lu, numblks: %lu",
MAJOR(__entry->dev), MINOR(__entry->dev),
FC_REASON_NAME_STAT(EXT4_FC_REASON_XATTR),
@@ -2776,6 +2778,7 @@ TRACE_EVENT(ext4_fc_stats,
FC_REASON_NAME_STAT(EXT4_FC_REASON_RENAME_DIR),
FC_REASON_NAME_STAT(EXT4_FC_REASON_FALLOC_RANGE),
FC_REASON_NAME_STAT(EXT4_FC_REASON_INODE_JOURNAL_DATA),
+ FC_REASON_NAME_STAT(EXT4_FC_REASON_ENCRYPTED_FILENAME),
__entry->fc_commits, __entry->fc_ineligible_commits,
__entry->fc_numblks)
);
--
2.38.1
From: Eric Biggers <[email protected]>
When space at the end of fast-commit journal blocks is unused, make sure
to zero it out so that uninitialized memory is not leaked to disk.
Fixes: aa75f4d3daae ("ext4: main fast-commit commit path")
Cc: <[email protected]> # v5.10+
Signed-off-by: Eric Biggers <[email protected]>
---
fs/ext4/fast_commit.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index da0c8228cf9c3..1e8be05542396 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -737,6 +737,9 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
*crc = ext4_chksum(sbi, *crc, tl, EXT4_FC_TAG_BASE_LEN);
if (pad_len > 0)
ext4_fc_memzero(sb, tl + 1, pad_len, crc);
+ /* Don't leak uninitialized memory in the unused last byte. */
+ *((u8 *)(tl + 1) + pad_len) = 0;
+
ext4_fc_submit_bh(sb, false);
ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
@@ -793,6 +796,8 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
dst += sizeof(tail.fc_tid);
tail.fc_crc = cpu_to_le32(crc);
ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
+ dst += sizeof(tail.fc_crc);
+ memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */
ext4_fc_submit_bh(sb, true);
--
2.38.1
From: Eric Biggers <[email protected]>
Validate the inode and filename lengths in fast-commit journal records
so that a malicious fast-commit journal cannot cause a crash by having
invalid values for these. Also validate EXT4_FC_TAG_DEL_RANGE.
Fixes: aa75f4d3daae ("ext4: main fast-commit commit path")
Cc: <[email protected]> # v5.10+
Signed-off-by: Eric Biggers <[email protected]>
---
fs/ext4/fast_commit.c | 38 +++++++++++++++++++-------------------
fs/ext4/fast_commit.h | 2 +-
2 files changed, 20 insertions(+), 20 deletions(-)
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 1e8be05542396..d5ad4b2b235d6 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -1991,32 +1991,31 @@ void ext4_fc_replay_cleanup(struct super_block *sb)
kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}
-static inline bool ext4_fc_tag_len_isvalid(struct ext4_fc_tl *tl,
- u8 *val, u8 *end)
+static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi,
+ int tag, int len)
{
- if (val + tl->fc_len > end)
- return false;
-
- /* Here only check ADD_RANGE/TAIL/HEAD which will read data when do
- * journal rescan before do CRC check. Other tags length check will
- * rely on CRC check.
- */
- switch (tl->fc_tag) {
+ switch (tag) {
case EXT4_FC_TAG_ADD_RANGE:
- return (sizeof(struct ext4_fc_add_range) == tl->fc_len);
- case EXT4_FC_TAG_TAIL:
- return (sizeof(struct ext4_fc_tail) <= tl->fc_len);
- case EXT4_FC_TAG_HEAD:
- return (sizeof(struct ext4_fc_head) == tl->fc_len);
+ return len == sizeof(struct ext4_fc_add_range);
case EXT4_FC_TAG_DEL_RANGE:
+ return len == sizeof(struct ext4_fc_del_range);
+ case EXT4_FC_TAG_CREAT:
case EXT4_FC_TAG_LINK:
case EXT4_FC_TAG_UNLINK:
- case EXT4_FC_TAG_CREAT:
+ len -= sizeof(struct ext4_fc_dentry_info);
+ return len >= 1 && len <= EXT4_NAME_LEN;
case EXT4_FC_TAG_INODE:
+ len -= sizeof(struct ext4_fc_inode);
+ return len >= EXT4_GOOD_OLD_INODE_SIZE &&
+ len <= sbi->s_inode_size;
case EXT4_FC_TAG_PAD:
- default:
- return true;
+ return true; /* padding can have any length */
+ case EXT4_FC_TAG_TAIL:
+ return len >= sizeof(struct ext4_fc_tail);
+ case EXT4_FC_TAG_HEAD:
+ return len == sizeof(struct ext4_fc_head);
}
+ return false;
}
/*
@@ -2079,7 +2078,8 @@ static int ext4_fc_replay_scan(journal_t *journal,
cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
ext4_fc_get_tl(&tl, cur);
val = cur + EXT4_FC_TAG_BASE_LEN;
- if (!ext4_fc_tag_len_isvalid(&tl, val, end)) {
+ if (tl.fc_len > end - val ||
+ !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) {
ret = state->fc_replay_num_tags ?
JBD2_FC_REPLAY_STOP : -ECANCELED;
goto out_err;
diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h
index 256f2ad27204a..2fadb2c4780c8 100644
--- a/fs/ext4/fast_commit.h
+++ b/fs/ext4/fast_commit.h
@@ -58,7 +58,7 @@ struct ext4_fc_dentry_info {
__u8 fc_dname[];
};
-/* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */
+/* Value structure for EXT4_FC_TAG_INODE. */
struct ext4_fc_inode {
__le32 fc_ino;
__u8 fc_raw_inode[];
--
2.38.1
From: Eric Biggers <[email protected]>
As is done elsewhere in the file, build the struct ext4_fc_tl on the
stack and memcpy() it into the buffer, rather than directly writing it
to a potentially-unaligned location in the buffer.
Fixes: aa75f4d3daae ("ext4: main fast-commit commit path")
Cc: <[email protected]> # v5.10+
Signed-off-by: Eric Biggers <[email protected]>
---
fs/ext4/fast_commit.c | 39 +++++++++++++++++++++------------------
1 file changed, 21 insertions(+), 18 deletions(-)
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index d5ad4b2b235d6..892fa7c7a768b 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -675,6 +675,15 @@ static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
/* Ext4 commit path routines */
+/* memcpy to fc reserved space and update CRC */
+static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
+ int len, u32 *crc)
+{
+ if (crc)
+ *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
+ return memcpy(dst, src, len);
+}
+
/* memzero and update CRC */
static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
u32 *crc)
@@ -700,12 +709,13 @@ static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
*/
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
- struct ext4_fc_tl *tl;
+ struct ext4_fc_tl tl;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct buffer_head *bh;
int bsize = sbi->s_journal->j_blocksize;
int ret, off = sbi->s_fc_bytes % bsize;
int pad_len;
+ u8 *dst;
/*
* After allocating len, we should have space at least for a 0 byte
@@ -729,16 +739,18 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
return sbi->s_fc_bh->b_data + off;
}
/* Need to add PAD tag */
- tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
- tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
+ dst = sbi->s_fc_bh->b_data + off;
+ tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
pad_len = bsize - off - 1 - EXT4_FC_TAG_BASE_LEN;
- tl->fc_len = cpu_to_le16(pad_len);
- if (crc)
- *crc = ext4_chksum(sbi, *crc, tl, EXT4_FC_TAG_BASE_LEN);
- if (pad_len > 0)
- ext4_fc_memzero(sb, tl + 1, pad_len, crc);
+ tl.fc_len = cpu_to_le16(pad_len);
+ ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc);
+ dst += EXT4_FC_TAG_BASE_LEN;
+ if (pad_len > 0) {
+ ext4_fc_memzero(sb, dst, pad_len, crc);
+ dst += pad_len;
+ }
/* Don't leak uninitialized memory in the unused last byte. */
- *((u8 *)(tl + 1) + pad_len) = 0;
+ *dst = 0;
ext4_fc_submit_bh(sb, false);
@@ -750,15 +762,6 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
return sbi->s_fc_bh->b_data;
}
-/* memcpy to fc reserved space and update CRC */
-static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
- int len, u32 *crc)
-{
- if (crc)
- *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
- return memcpy(dst, src, len);
-}
-
/*
* Complete a fast commit by writing tail tag.
*
--
2.38.1
From: Eric Biggers <[email protected]>
Instead of checksumming each field as it is added to the block, just
checksum each block before it is written. This is simpler, and also
much more efficient.
Signed-off-by: Eric Biggers <[email protected]>
---
fs/ext4/fast_commit.c | 54 +++++++++++++------------------------------
1 file changed, 16 insertions(+), 38 deletions(-)
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 7ed71c652f67f..1da85f6261166 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -675,27 +675,6 @@ static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
/* Ext4 commit path routines */
-/* memcpy to fc reserved space and update CRC */
-static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
- int len, u32 *crc)
-{
- if (crc)
- *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
- return memcpy(dst, src, len);
-}
-
-/* memzero and update CRC */
-static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
- u32 *crc)
-{
- void *ret;
-
- ret = memset(dst, 0, len);
- if (crc)
- *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
- return ret;
-}
-
/*
* Allocate len bytes on a fast commit buffer.
*
@@ -749,8 +728,9 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
tl.fc_len = cpu_to_le16(remaining);
- ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc);
- ext4_fc_memzero(sb, dst + EXT4_FC_TAG_BASE_LEN, remaining, crc);
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
+ memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
+ *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize);
ext4_fc_submit_bh(sb, false);
@@ -792,13 +772,15 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail));
sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
- ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, &crc);
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
dst += EXT4_FC_TAG_BASE_LEN;
tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
- ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
+ memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
dst += sizeof(tail.fc_tid);
+ crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data,
+ dst - (u8 *)sbi->s_fc_bh->b_data);
tail.fc_crc = cpu_to_le32(crc);
- ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
+ memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
dst += sizeof(tail.fc_crc);
memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */
@@ -824,8 +806,8 @@ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
tl.fc_tag = cpu_to_le16(tag);
tl.fc_len = cpu_to_le16(len);
- ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc);
- ext4_fc_memcpy(sb, dst + EXT4_FC_TAG_BASE_LEN, val, len, crc);
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
+ memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len);
return true;
}
@@ -847,11 +829,11 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
- ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc);
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
dst += EXT4_FC_TAG_BASE_LEN;
- ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
+ memcpy(dst, &fcd, sizeof(fcd));
dst += sizeof(fcd);
- ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
+ memcpy(dst, fc_dentry->fcd_name.name, dlen);
return true;
}
@@ -889,15 +871,11 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
if (!dst)
goto err;
- if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc))
- goto err;
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
dst += EXT4_FC_TAG_BASE_LEN;
- if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
- goto err;
+ memcpy(dst, &fc_inode, sizeof(fc_inode));
dst += sizeof(fc_inode);
- if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
- inode_len, crc))
- goto err;
+ memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len);
ret = 0;
err:
brelse(iloc.bh);
--
2.38.1
From: Eric Biggers <[email protected]>
Due to several different off-by-one errors, or perhaps due to a late
change in design that wasn't fully reflected in the code that was
actually merged, there are several very strange constraints on how
fast-commit blocks are filled with tlv entries:
- tlvs must start at least 10 bytes before the end of the block, even
though the minimum tlv length is 8. Otherwise, the replay code will
ignore them. (BUG: ext4_fc_reserve_space() could violate this
requirement if called with a len of blocksize - 9 or blocksize - 8.
Fortunately, this doesn't seem to happen currently.)
- tlvs must end at least 1 byte before the end of the block. Otherwise
the replay code will consider them to be invalid. This quirk
contributed to a bug (fixed by an earlier commit) where uninitialized
memory was being leaked to disk in the last byte of blocks.
Also, strangely these constraints don't apply to the replay code in
e2fsprogs, which will accept any tlvs in the blocks (with no bounds
checks at all, but that is a separate issue...).
Given that this all seems to be a bug, let's fix it by just filling
blocks with tlv entries in the natural way.
Note that old kernels will be unable to replay fast-commit journals
created by kernels that have this commit.
Fixes: aa75f4d3daae ("ext4: main fast-commit commit path")
Cc: <[email protected]> # v5.10+
Signed-off-by: Eric Biggers <[email protected]>
---
fs/ext4/fast_commit.c | 66 +++++++++++++++++++++----------------------
1 file changed, 33 insertions(+), 33 deletions(-)
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 892fa7c7a768b..7ed71c652f67f 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -714,43 +714,43 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
struct buffer_head *bh;
int bsize = sbi->s_journal->j_blocksize;
int ret, off = sbi->s_fc_bytes % bsize;
- int pad_len;
+ int remaining;
u8 *dst;
/*
- * After allocating len, we should have space at least for a 0 byte
- * padding.
+ * If 'len' is too long to fit in any block alongside a PAD tlv, then we
+ * cannot fulfill the request.
*/
- if (len + EXT4_FC_TAG_BASE_LEN > bsize)
+ if (len > bsize - EXT4_FC_TAG_BASE_LEN)
return NULL;
- if (bsize - off - 1 > len + EXT4_FC_TAG_BASE_LEN) {
- /*
- * Only allocate from current buffer if we have enough space for
- * this request AND we have space to add a zero byte padding.
- */
- if (!sbi->s_fc_bh) {
- ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
- if (ret)
- return NULL;
- sbi->s_fc_bh = bh;
- }
- sbi->s_fc_bytes += len;
- return sbi->s_fc_bh->b_data + off;
+ if (!sbi->s_fc_bh) {
+ ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
+ if (ret)
+ return NULL;
+ sbi->s_fc_bh = bh;
}
- /* Need to add PAD tag */
dst = sbi->s_fc_bh->b_data + off;
+
+ /*
+ * Allocate the bytes in the current block if we can do so while still
+ * leaving enough space for a PAD tlv.
+ */
+ remaining = bsize - EXT4_FC_TAG_BASE_LEN - off;
+ if (len <= remaining) {
+ sbi->s_fc_bytes += len;
+ return dst;
+ }
+
+ /*
+ * Else, terminate the current block with a PAD tlv, then allocate a new
+ * block and allocate the bytes at the start of that new block.
+ */
+
tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
- pad_len = bsize - off - 1 - EXT4_FC_TAG_BASE_LEN;
- tl.fc_len = cpu_to_le16(pad_len);
+ tl.fc_len = cpu_to_le16(remaining);
ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc);
- dst += EXT4_FC_TAG_BASE_LEN;
- if (pad_len > 0) {
- ext4_fc_memzero(sb, dst, pad_len, crc);
- dst += pad_len;
- }
- /* Don't leak uninitialized memory in the unused last byte. */
- *dst = 0;
+ ext4_fc_memzero(sb, dst + EXT4_FC_TAG_BASE_LEN, remaining, crc);
ext4_fc_submit_bh(sb, false);
@@ -758,7 +758,7 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
if (ret)
return NULL;
sbi->s_fc_bh = bh;
- sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
+ sbi->s_fc_bytes += bsize - off + len;
return sbi->s_fc_bh->b_data;
}
@@ -789,7 +789,7 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
off = sbi->s_fc_bytes % bsize;
tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
- tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
+ tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail));
sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, &crc);
@@ -2056,7 +2056,7 @@ static int ext4_fc_replay_scan(journal_t *journal,
state = &sbi->s_fc_replay_state;
start = (u8 *)bh->b_data;
- end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
+ end = start + journal->j_blocksize;
if (state->fc_replay_expected_off == 0) {
state->fc_cur_tag = 0;
@@ -2077,7 +2077,7 @@ static int ext4_fc_replay_scan(journal_t *journal,
}
state->fc_replay_expected_off++;
- for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN;
+ for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
ext4_fc_get_tl(&tl, cur);
val = cur + EXT4_FC_TAG_BASE_LEN;
@@ -2195,9 +2195,9 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
#endif
start = (u8 *)bh->b_data;
- end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
+ end = start + journal->j_blocksize;
- for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN;
+ for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
ext4_fc_get_tl(&tl, cur);
val = cur + EXT4_FC_TAG_BASE_LEN;
--
2.38.1
On Sun, Nov 06, 2022 at 02:48:34PM -0800, Eric Biggers wrote:
>
> This series fixes several bugs in the fast-commit feature.
>
> Patch 6 may be the most controversial patch of this series, since it
> would make old kernels unable to replay fast-commit journals created by
> new kernels. I'd appreciate any thoughts on whether that's okay. I can
> drop that patch if needed.
>
> I've tested that this series doesn't introduce any regressions with
> 'gce-xfstests -c ext4/fast_commit -g auto'. Note that ext4/039,
> ext4/053, and generic/475 fail both before and after.
>
> Eric Biggers (7):
> ext4: disable fast-commit of encrypted dir operations
> ext4: don't set up encryption key during jbd2 transaction
> ext4: fix leaking uninitialized memory in fast-commit journal
> ext4: add missing validation of fast-commit record lengths
> ext4: fix unaligned memory access in ext4_fc_reserve_space()
> ext4: fix off-by-one errors in fast-commit block filling
> ext4: simplify fast-commit CRC calculation
>
> fs/ext4/ext4.h | 4 +-
> fs/ext4/fast_commit.c | 203 ++++++++++++++++++------------------
> fs/ext4/fast_commit.h | 3 +-
> fs/ext4/namei.c | 44 ++++----
> include/trace/events/ext4.h | 7 +-
> 5 files changed, 132 insertions(+), 129 deletions(-)
>
>
> base-commit: 089d1c31224e6b266ece3ee555a3ea2c9acbe5c2
Any thoughts on this patch series?
- Eric
On Tue, Nov 15, 2022 at 05:18:11PM -0800, Eric Biggers wrote:
> On Sun, Nov 06, 2022 at 02:48:34PM -0800, Eric Biggers wrote:
> >
> > This series fixes several bugs in the fast-commit feature.
> >
> > Patch 6 may be the most controversial patch of this series, since it
> > would make old kernels unable to replay fast-commit journals created by
> > new kernels. I'd appreciate any thoughts on whether that's okay. I can
> > drop that patch if needed.
> >
> > I've tested that this series doesn't introduce any regressions with
> > 'gce-xfstests -c ext4/fast_commit -g auto'. Note that ext4/039,
> > ext4/053, and generic/475 fail both before and after.
> >
> > Eric Biggers (7):
> > ext4: disable fast-commit of encrypted dir operations
> > ext4: don't set up encryption key during jbd2 transaction
> > ext4: fix leaking uninitialized memory in fast-commit journal
> > ext4: add missing validation of fast-commit record lengths
> > ext4: fix unaligned memory access in ext4_fc_reserve_space()
> > ext4: fix off-by-one errors in fast-commit block filling
> > ext4: simplify fast-commit CRC calculation
> >
> > fs/ext4/ext4.h | 4 +-
> > fs/ext4/fast_commit.c | 203 ++++++++++++++++++------------------
> > fs/ext4/fast_commit.h | 3 +-
> > fs/ext4/namei.c | 44 ++++----
> > include/trace/events/ext4.h | 7 +-
> > 5 files changed, 132 insertions(+), 129 deletions(-)
> >
> >
> > base-commit: 089d1c31224e6b266ece3ee555a3ea2c9acbe5c2
>
> Any thoughts on this patch series?
>
> - Eric
Ping?
- Eric
On Sun, Nov 06, 2022 at 02:48:34PM -0800, Eric Biggers wrote:
> From: Eric Biggers <[email protected]
>
> This series fixes several bugs in the fast-commit feature.
>
> Patch 6 may be the most controversial patch of this series, since it
> would make old kernels unable to replay fast-commit journals created by
> new kernels. I'd appreciate any thoughts on whether that's okay. I can
> drop that patch if needed.
Mumble. Normally, it's something we would avoid, since there aren't
that many users using fast commit, since it's not enabled by default.
And given that the off-by-one errors are bugs, an it's a question of
old kernels requiring a pretty buggy layout, the question is whether
it's worth it to do an explicit version / feature flag and support
both for some period of time.
I'm inclined to say no, and just let things slide, and instead make
sure that e2fsck can handle both the old and the new format, and let
that handle the fast commit replay if necessary.
Harshad, what do you think?
- Ted
On Tue, 6 Dec 2022 at 13:04, Theodore Ts'o <[email protected]> wrote:
>
> On Sun, Nov 06, 2022 at 02:48:34PM -0800, Eric Biggers wrote:
> > From: Eric Biggers <[email protected]
> >
> > This series fixes several bugs in the fast-commit feature.
> >
> > Patch 6 may be the most controversial patch of this series, since it
> > would make old kernels unable to replay fast-commit journals created by
> > new kernels. I'd appreciate any thoughts on whether that's okay. I can
> > drop that patch if needed.
>
> Mumble. Normally, it's something we would avoid, since there aren't
> that many users using fast commit, since it's not enabled by default.
> And given that the off-by-one errors are bugs, an it's a question of
> old kernels requiring a pretty buggy layout, the question is whether
> it's worth it to do an explicit version / feature flag and support
> both for some period of time.
>
> I'm inclined to say no, and just let things slide, and instead make
> sure that e2fsck can handle both the old and the new format, and let
> that handle the fast commit replay if necessary.
>
> Harshad, what do you think?
I agree. Making kernel replay backward compatible would complicate the
replay code without adding that much value (since there aren't that
many users and fast commit isn't enabled by default). So, having the
ability in e2fsck to do replays should suffice in this case.
- Harshad
>
> - Ted